/***********************************************************************/
/*                                                                     */
/*   svm_learn.c                                                       */
/*                                                                     */
/*   Learning module of Support Vector Machine.                        */
/*                                                                     */
/*   Author: Thorsten Joachims                                         */
/*   Date: 31.10.05                                                    */
/*                                                                     */
/*   Copyright (c) 2005  Thorsten Joachims - All rights reserved       */
/*                                                                     */
/*   This software is available for non-commercial use only. It must   */
/*   not be modified and distributed without prior permission of the   */
/*   author. The author is not responsible for implications from the   */
/*   use of this software.                                             */
/*                                                                     */
/***********************************************************************/


# include "svm_common.h"
# include "svm_learn.h"

#define MAX(x,y)      ((x) < (y) ? (y) : (x))
#define MIN(x,y)      ((x) > (y) ? (y) : (x))
#define SIGN(x)       ((x) > (0) ? (1) : (((x) < (0) ? (-1) : (0))))
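/* Caution: like all function-like macros, the arguments above may be
   evaluated more than once, so they should not be called with
   expressions that have side effects (e.g. MAX(i++,j)). */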

/* interface to QP-solver */
double *optimize_qp(QP *, double *, long, double *, LEARN_PARM *);
void reset_hideo_globals();

/*---------------------------------------------------------------------------*/

/* Learns an SVM classification model based on the training data in
docs/label. The resulting model is returned in the structure
model. */
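/* A minimal calling sketch (illustrative only; it assumes the training
   data was read elsewhere and that learn_parm/kernel_parm were filled
   in with valid defaults, e.g. as done in svm_learn_main.c):

     MODEL *model=(MODEL *)my_malloc(sizeof(MODEL));
     KERNEL_CACHE *kernel_cache=NULL;
     if(kernel_parm.kernel_type != LINEAR)
       kernel_cache=kernel_cache_init(totdoc,learn_parm.kernel_cache_size);
     svm_learn_classification(docs,target,totdoc,totwords,&learn_parm,
                              &kernel_parm,kernel_cache,model,NULL);

   Passing NULL for alpha starts the optimization from alpha=0. */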

void svm_learn_classification(DOC **docs, double *classlabel, long int
							  totdoc, long int totwords, 
							  LEARN_PARM *learn_parm, 
							  KERNEL_PARM *kernel_parm, 
							  KERNEL_CACHE *kernel_cache, 
							  MODEL *model,
							  double *alpha)
							  /* docs:        Training vectors (x-part) */
							  /* classlabel:  Training labels (y-part, zero if test example for
							  transduction) */
							  /* totdoc:      Number of examples in docs/label */
							  /* totwords:    Number of features (i.e. highest feature index) */
							  /* learn_parm:  Learning parameters */
							  /* kernel_parm: Kernel parameters */
							  /* kernel_cache:Initialized Cache of size totdoc, if using a kernel. 
							  NULL if linear.*/
							  /* model:       Returns learning result (assumed empty before called) */
							  /* alpha:       Start values for the alpha variables or NULL
							  pointer. The new alpha values are returned after 
							  optimization if not NULL. Array must be of size totdoc. */
{
	long *inconsistent,i,*label;
	long inconsistentnum;
	long misclassified,upsupvecnum;
	double loss,model_length,example_length;
	double dualitygap,xisum,alphasum,xi;
	double maxdiff,*lin,*a,*c;
	double runtime_start,runtime_end;
	long iterations;
	long *unlabeled,transduction;
	long heldout;
	long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0;
	long loocomputed=0;
	double runtime_start_loo=0,runtime_start_xa=0;
	double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg;
	long *index,*index2dnum;
	double *weights;
	CFLOAT *aicache;  /* buffer to keep one row of hessian */

	double *xi_fullset; /* buffer for storing xi on full sample in loo */
	double *a_fullset;  /* buffer for storing alpha on full sample in loo */
	TIMING timing_profile;
	SHRINK_STATE shrink_state;

	runtime_start=get_runtime();
	timing_profile.time_kernel=0;
	timing_profile.time_opti=0;
	timing_profile.time_shrink=0;
	timing_profile.time_update=0;
	timing_profile.time_model=0;
	timing_profile.time_check=0;
	timing_profile.time_select=0;
	kernel_cache_statistic=0;

	learn_parm->totwords=totwords;

	/* make sure -n value is reasonable */
	if((learn_parm->svm_newvarsinqp < 2) 
		|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
			learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
	}

	init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

	label = (long *)my_malloc(sizeof(long)*totdoc);
	inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
	unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
	c = (double *)my_malloc(sizeof(double)*totdoc);
	a = (double *)my_malloc(sizeof(double)*totdoc);
	a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
	xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
	lin = (double *)my_malloc(sizeof(double)*totdoc);
	learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
	model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
	model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
	model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

	model->at_upper_bound=0;
	model->b=0;	       
	model->supvec[0]=0;  /* element 0 reserved and empty for now */
	model->alpha[0]=0;
	model->lin_weights=NULL;
	model->totwords=totwords;
	model->totdoc=totdoc;
	model->kernel_parm=(*kernel_parm);
	model->sv_num=1;
	model->loo_error=-1;
	model->loo_recall=-1;
	model->loo_precision=-1;
	model->xa_error=-1;
	model->xa_recall=-1;
	model->xa_precision=-1;
	inconsistentnum=0;
	transduction=0;

	r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
	r_delta_sq=r_delta*r_delta;

	r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
	if(learn_parm->svm_c == 0.0) {  /* default value for C */
		learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
		if(verbosity>=1) 
			printf("Setting default regularization parameter C=%.4f\n",
			learn_parm->svm_c);
	}
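	/* In other words, when no explicit C is given, it defaults to
	   roughly 1/(average squared example length), which puts the
	   margin term and the loss term on a comparable scale regardless
	   of how the input vectors are scaled. */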

	learn_parm->eps=-1.0;      /* equivalent regression epsilon for
							   classification */

	for(i=0;i<totdoc;i++) {    /* various inits */
		docs[i]->docnum=i;
		inconsistent[i]=0;
		a[i]=0;
		lin[i]=0;
		c[i]=0.0;
		unlabeled[i]=0;
		if(classlabel[i] == 0) {
			unlabeled[i]=1;
			label[i]=0;
			transduction=1;
		}
		if(classlabel[i] > 0) {
			learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
				docs[i]->costfactor;
			label[i]=1;
			trainpos++;
		}
		else if(classlabel[i] < 0) {
			learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
			label[i]=-1;
			trainneg++;
		}
		else {
			learn_parm->svm_cost[i]=0;
		}
	}
	if(verbosity>=2) {
		printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout);
	}

	/* caching makes no sense for linear kernel */
	if(kernel_parm->kernel_type == LINEAR) {
		/* kernel_cache = NULL; */
	} 

	/* compute starting state for initial alpha values */
	if(alpha) {
		if(verbosity>=1) {
			printf("Computing starting state..."); fflush(stdout);
		}
		index = (long *)my_malloc(sizeof(long)*totdoc);
		index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
		weights=(double *)my_malloc(sizeof(double)*(totwords+1));
		aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
		for(i=0;i<totdoc;i++) {    /* create full index and clip alphas */
			index[i]=1;
			alpha[i]=fabs(alpha[i]);
			if(alpha[i]<0) alpha[i]=0;
			if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
		}
		if(kernel_cache && (kernel_parm->kernel_type != LINEAR)) {
			for(i=0;i<totdoc;i++)     /* fill kernel cache with unbounded SV */
				if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i]) 
					&& (kernel_cache_space_available(kernel_cache))) 
					cache_kernel_row(kernel_cache,docs,i,kernel_parm);
			for(i=0;i<totdoc;i++)     /* fill rest of kernel cache with bounded SV */
				if((alpha[i]==learn_parm->svm_cost[i]) 
					&& (kernel_cache_space_available(kernel_cache))) 
					cache_kernel_row(kernel_cache,docs,i,kernel_parm);
		}
		clear_nvector(weights,totwords); /* set weights to zero */
		(void)compute_index(index,totdoc,index2dnum);
		update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
			totwords,kernel_parm,kernel_cache,lin,aicache,
			weights);
		(void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
			learn_parm,index2dnum,index2dnum,model);
		for(i=0;i<totdoc;i++) {    /* copy initial alphas */
			a[i]=alpha[i];
		}
		free(index);
		free(index2dnum);
		free(weights);
		free(aicache);
		if(verbosity>=1) {
			printf("done.\n");  fflush(stdout);
		}   
	} 

	if(transduction) {
		learn_parm->svm_iter_to_shrink=99999999;
		if(verbosity >= 1)
			printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n");
	}

	if(transduction && learn_parm->compute_loo) {
		learn_parm->compute_loo=0;
		if(verbosity >= 1)
			printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n");
	}    

	if(learn_parm->remove_inconsistent && learn_parm->compute_loo) {
		learn_parm->compute_loo=0;
		printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n");
	}    

	if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) {
		learn_parm->compute_loo=0;
		printf("\nCannot compute leave-one-out with only one example in one class.\n\n");
	}    


	if(verbosity==1) {
		printf("Optimizing"); fflush(stdout);
	}

	/* train the svm */
	iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
		kernel_parm,kernel_cache,&shrink_state,model,
		inconsistent,unlabeled,a,lin,
		c,&timing_profile,
		&maxdiff,(long)-1,
		(long)1);

	if(verbosity>=1) {
		if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

		misclassified=0;
		for(i=0;(i<totdoc);i++) { /* get final statistic */
			if((lin[i]-model->b)*(double)label[i] <= 0.0) 
				misclassified++;
		}

		printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n",
			misclassified,maxdiff); 

		runtime_end=get_runtime();
		if(verbosity>=2) {
			printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
				(runtime_end-runtime_start)/100.0,
				(100.0*timing_profile.time_kernel)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_opti)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_shrink)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_update)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_model)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_check)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_select)/(runtime_end-runtime_start));
		}
		else {
			printf("Runtime in cpu-seconds: %.2f\n",
				(runtime_end-runtime_start)/100.0);
		}

		if(learn_parm->remove_inconsistent) {	  
			inconsistentnum=0;
			for(i=0;i<totdoc;i++) 
				if(inconsistent[i]) 
					inconsistentnum++;
			printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
				model->sv_num-1,inconsistentnum);
		}
		else {
			upsupvecnum=0;
			for(i=1;i<model->sv_num;i++) {
				if(fabs(model->alpha[i]) >= 
					(learn_parm->svm_cost[(model->supvec[i])->docnum]-
					learn_parm->epsilon_a)) 
					upsupvecnum++;
			}
			printf("Number of SV: %ld (including %ld at upper bound)\n",
				model->sv_num-1,upsupvecnum);
		}

		if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
			loss=0;
			xisum=0;
			alphasum=0;
			model_length=0; 
			for(i=0;i<totdoc;i++) {
				xi=MAX(0.0,1.0-(lin[i]-model->b)*(double)label[i]);
				if(xi > learn_parm->epsilon_crit)
					loss+=xi;
				xisum+=xi*learn_parm->svm_cost[i];
				alphasum+=a[i];
				model_length+=a[i]*label[i]*lin[i];
			}
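			/* Since lin[i]=w*x_i and w=sum_j a_j*y_j*x_j, the accumulated
			   a[i]*label[i]*lin[i] equals w*w, so model_length below is |w|.
			   The duality gap is then primal minus dual objective:
			   (0.5*|w|^2 + sum_i C_i*xi_i) - (sum_i a_i - 0.5*|w|^2). */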
			model_length=sqrt(model_length);
			dualitygap=(0.5*model_length*model_length+xisum)
				-(alphasum-0.5*model_length*model_length);
			fprintf(stdout,"Upper bound on duality gap: gap=%.5f\n",dualitygap);
			fprintf(stdout,"Dual objective value: dval=%.5f\n",
				alphasum-0.5*model_length*model_length);
			fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
			fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
			example_length=estimate_sphere(model); 
			fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
				length_of_longest_document_vector(docs,totdoc,kernel_parm));
			fprintf(stdout,"Estimated VCdim of classifier: VCdim<=%.5f\n",
				estimate_margin_vcdim(model,model_length,example_length));
			if((!learn_parm->remove_inconsistent) && (!transduction)) {
				runtime_start_xa=get_runtime();
				if(verbosity>=1) {
					printf("Computing XiAlpha-estimates..."); fflush(stdout);
				}
				compute_xa_estimates(model,label,unlabeled,totdoc,docs,lin,a,
					kernel_parm,learn_parm,&(model->xa_error),
					&(model->xa_recall),&(model->xa_precision));
				if(verbosity>=1) {
					printf("done\n");
				}
				printf("Runtime for XiAlpha-estimates in cpu-seconds: %.2f\n",
					(get_runtime()-runtime_start_xa)/100.0);

				fprintf(stdout,"XiAlpha-estimate of the error: error<=%.2f%% (rho=%.2f,depth=%ld)\n",
					model->xa_error,learn_parm->rho,learn_parm->xa_depth);
				fprintf(stdout,"XiAlpha-estimate of the recall: recall=>%.2f%% (rho=%.2f,depth=%ld)\n",
					model->xa_recall,learn_parm->rho,learn_parm->xa_depth);
				fprintf(stdout,"XiAlpha-estimate of the precision: precision=>%.2f%% (rho=%.2f,depth=%ld)\n",
					model->xa_precision,learn_parm->rho,learn_parm->xa_depth);
			}
			else if(!learn_parm->remove_inconsistent) {
				estimate_transduction_quality(model,label,unlabeled,totdoc,docs,lin);
			}
		}
		if(verbosity>=1) {
			printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
		}
	}


	/* leave-one-out testing starts now */
	if(learn_parm->compute_loo) {
		/* save results of training on full dataset for leave-one-out */
		runtime_start_loo=get_runtime();
		for(i=0;i<totdoc;i++) {
			xi_fullset[i]=1.0-((lin[i]-model->b)*(double)label[i]);
			if(xi_fullset[i]<0) xi_fullset[i]=0;
			a_fullset[i]=a[i];
		}
		if(verbosity>=1) {
			printf("Computing leave-one-out");
		}

		/* repeat this loop for every held-out example */
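		/* Retraining is skipped whenever one of two bounds applies: if
		   rho*alpha_i*R^2 + xi_i < 1 the example provably cannot become a
		   leave-one-out error, and if xi_i > 1 it provably must (the same
		   reasoning as in the xi-alpha estimates). Only the remaining
		   examples trigger an actual retraining run, with their upper
		   bound C_i temporarily set to zero. */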
		for(heldout=0;(heldout<totdoc);heldout++) {
			if(learn_parm->rho*a_fullset[heldout]*r_delta_sq+xi_fullset[heldout]
			< 1.0) { 
				/* guaranteed to not produce a leave-one-out error */
				if(verbosity==1) {
					printf("+"); fflush(stdout); 
				}
			}
			else if(xi_fullset[heldout] > 1.0) {
				/* guaranteed to produce a leave-one-out error */
				loo_count++;
				if(label[heldout] > 0)  loo_count_pos++; else loo_count_neg++;
				if(verbosity==1) {
					printf("-"); fflush(stdout); 
				}
			}
			else {
				loocomputed++;
				heldout_c=learn_parm->svm_cost[heldout]; /* set upper bound to zero */
				learn_parm->svm_cost[heldout]=0;
				/* make sure heldout example is not currently  */
				/* shrunk away. Assumes that lin is up to date! */
				shrink_state.active[heldout]=1;  
				if(verbosity>=2) 
					printf("\nLeave-One-Out test on example %ld\n",heldout);
				if(verbosity>=1) {
					printf("(?[%ld]",heldout); fflush(stdout); 
				}

				optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
					kernel_parm,
					kernel_cache,&shrink_state,model,inconsistent,unlabeled,
					a,lin,c,&timing_profile,
					&maxdiff,heldout,(long)2);

				/* printf("%.20f\n",(lin[heldout]-model->b)*(double)label[heldout]); */

				if(((lin[heldout]-model->b)*(double)label[heldout]) <= 0.0) { 
					loo_count++;                            /* there was a loo-error */
					if(label[heldout] > 0)  loo_count_pos++; else loo_count_neg++;
					if(verbosity>=1) {
						printf("-)"); fflush(stdout); 
					}
				}
				else {
					if(verbosity>=1) {
						printf("+)"); fflush(stdout); 
					}
				}
				/* now we need to restore the original data set*/
				learn_parm->svm_cost[heldout]=heldout_c; /* restore upper bound */
			}
		} /* end of leave-one-out loop */


		if(verbosity>=1) {
			printf("\nRetrain on full problem"); fflush(stdout); 
		}
		optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
			kernel_parm,
			kernel_cache,&shrink_state,model,inconsistent,unlabeled,
			a,lin,c,&timing_profile,
			&maxdiff,(long)-1,(long)1);
		if(verbosity >= 1) 
			printf("done.\n");


		/* after all leave-one-out computed */
		model->loo_error=100.0*loo_count/(double)totdoc;
		model->loo_recall=(1.0-(double)loo_count_pos/(double)trainpos)*100.0;
		model->loo_precision=(trainpos-loo_count_pos)/
			(double)(trainpos-loo_count_pos+loo_count_neg)*100.0;
		if(verbosity >= 1) {
			fprintf(stdout,"Leave-one-out estimate of the error: error=%.2f%%\n",
				model->loo_error);
			fprintf(stdout,"Leave-one-out estimate of the recall: recall=%.2f%%\n",
				model->loo_recall);
			fprintf(stdout,"Leave-one-out estimate of the precision: precision=%.2f%%\n",
				model->loo_precision);
			fprintf(stdout,"Actual leave-one-outs computed:  %ld (rho=%.2f)\n",
				loocomputed,learn_parm->rho);
			printf("Runtime for leave-one-out in cpu-seconds: %.2f\n",
				(get_runtime()-runtime_start_loo)/100.0);
		}
	}

	if(learn_parm->alphafile[0])
		write_alphas(learn_parm->alphafile,a,label,totdoc);

	shrink_state_cleanup(&shrink_state);
	free(label);
	free(inconsistent);
	free(unlabeled);
	free(c);
	free(a);
	free(a_fullset);
	free(xi_fullset);
	free(lin);
	free(learn_parm->svm_cost);
}


/* Learns an SVM regression model based on the training data in
docs/label. The resulting model is returned in the structure
model. */

void svm_learn_regression(DOC **docs, double *value, long int totdoc, 
						  long int totwords, LEARN_PARM *learn_parm, 
						  KERNEL_PARM *kernel_parm, 
						  KERNEL_CACHE **kernel_cache, MODEL *model)
						  /* docs:        Training vectors (x-part) */
						  /* value:       Training values (y-part) */
						  /* totdoc:      Number of examples in docs/label */
						  /* totwords:    Number of features (i.e. highest feature index) */
						  /* learn_parm:  Learning parameters */
						  /* kernel_parm: Kernel parameters */
						  /* kernel_cache:Initialized Cache, if using a kernel. NULL if
						  linear. Note that it will be free'd and reassigned */
						  /* model:       Returns learning result (assumed empty before called) */
{
	long *inconsistent,i,j;
	long inconsistentnum;
	long upsupvecnum;
	double loss,model_length,example_length;
	double maxdiff,*lin,*a,*c;
	double runtime_start,runtime_end;
	long iterations,kernel_cache_size;
	long *unlabeled;
	double r_delta_sq=0,r_delta,r_delta_avg;
	double *xi_fullset; /* buffer for storing xi on full sample in loo */
	double *a_fullset;  /* buffer for storing alpha on full sample in loo */
	TIMING timing_profile;
	SHRINK_STATE shrink_state;
	DOC **docs_org;
	long *label;

	/* set up regression problem in standard form */
	docs_org=docs;
	docs = (DOC **)my_malloc(sizeof(DOC *)*2*totdoc);
	label = (long *)my_malloc(sizeof(long)*2*totdoc);
	c = (double *)my_malloc(sizeof(double)*2*totdoc);
	for(i=0;i<totdoc;i++) {   
		j=2*totdoc-1-i;
		docs[i]=create_example(i,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
		label[i]=+1;
		c[i]=value[i];
		docs[j]=create_example(j,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
		label[j]=-1;
		c[j]=value[i];
	}
	totdoc*=2;
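	/* The regression problem is thus encoded as 2*totdoc
	   classification-style constraints: example i (label +1) demands
	   w*x_i + b >= value_i - eps - xi, and its mirror at index
	   2*totdoc-1-i (label -1) demands w*x_i + b <= value_i + eps + xi.
	   The target values go into c[] as right-hand sides, and
	   learn_parm->eps supplies the width of the insensitivity tube. */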

	/* need to get a bigger kernel cache */
	if(*kernel_cache) {
		kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
		kernel_cache_cleanup(*kernel_cache);
		(*kernel_cache)=kernel_cache_init(totdoc,kernel_cache_size);
	}

	runtime_start=get_runtime();
	timing_profile.time_kernel=0;
	timing_profile.time_opti=0;
	timing_profile.time_shrink=0;
	timing_profile.time_update=0;
	timing_profile.time_model=0;
	timing_profile.time_check=0;
	timing_profile.time_select=0;
	kernel_cache_statistic=0;

	learn_parm->totwords=totwords;

	/* make sure -n value is reasonable */
	if((learn_parm->svm_newvarsinqp < 2) 
		|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
			learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
	}

	init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

	inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
	unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
	a = (double *)my_malloc(sizeof(double)*totdoc);
	a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
	xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
	lin = (double *)my_malloc(sizeof(double)*totdoc);
	learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
	model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
	model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
	model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

	model->at_upper_bound=0;
	model->b=0;	       
	model->supvec[0]=0;  /* element 0 reserved and empty for now */
	model->alpha[0]=0;
	model->lin_weights=NULL;
	model->totwords=totwords;
	model->totdoc=totdoc;
	model->kernel_parm=(*kernel_parm);
	model->sv_num=1;
	model->loo_error=-1;
	model->loo_recall=-1;
	model->loo_precision=-1;
	model->xa_error=-1;
	model->xa_recall=-1;
	model->xa_precision=-1;
	inconsistentnum=0;

	r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
	r_delta_sq=r_delta*r_delta;

	r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
	if(learn_parm->svm_c == 0.0) {  /* default value for C */
		learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
		if(verbosity>=1) 
			printf("Setting default regularization parameter C=%.4f\n",
			learn_parm->svm_c);
	}

	for(i=0;i<totdoc;i++) {    /* various inits */
		inconsistent[i]=0;
		a[i]=0;
		lin[i]=0;
		unlabeled[i]=0;
		if(label[i] > 0) {
			learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
				docs[i]->costfactor;
		}
		else if(label[i] < 0) {
			learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
		}
	}

	/* caching makes no sense for linear kernel */
	if((kernel_parm->kernel_type == LINEAR) && (*kernel_cache)) {
		printf("WARNING: Using a kernel cache for linear case will slow optimization down!\n");
	} 

	if(verbosity==1) {
		printf("Optimizing"); fflush(stdout);
	}

	/* train the svm */
	iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
		kernel_parm,*kernel_cache,&shrink_state,
		model,inconsistent,unlabeled,a,lin,c,
		&timing_profile,&maxdiff,(long)-1,
		(long)1);

	if(verbosity>=1) {
		if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

		printf("Optimization finished (maxdiff=%.5f).\n",maxdiff); 

		runtime_end=get_runtime();
		if(verbosity>=2) {
			printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
				(runtime_end-runtime_start)/100.0,
				(100.0*timing_profile.time_kernel)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_opti)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_shrink)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_update)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_model)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_check)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_select)/(runtime_end-runtime_start));
		}
		else {
			printf("Runtime in cpu-seconds: %.2f\n",
				(runtime_end-runtime_start)/100.0);
		}

		if(learn_parm->remove_inconsistent) {	  
			inconsistentnum=0;
			for(i=0;i<totdoc;i++) 
				if(inconsistent[i]) 
					inconsistentnum++;
			printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
				model->sv_num-1,inconsistentnum);
		}
		else {
			upsupvecnum=0;
			for(i=1;i<model->sv_num;i++) {
				if(fabs(model->alpha[i]) >= 
					(learn_parm->svm_cost[(model->supvec[i])->docnum]-
					learn_parm->epsilon_a)) 
					upsupvecnum++;
			}
			printf("Number of SV: %ld (including %ld at upper bound)\n",
				model->sv_num-1,upsupvecnum);
		}

		if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
			loss=0;
			model_length=0; 
			for(i=0;i<totdoc;i++) {
				if((lin[i]-model->b)*(double)label[i] < (-learn_parm->eps+(double)label[i]*c[i])-learn_parm->epsilon_crit)
					loss+=-learn_parm->eps+(double)label[i]*c[i]-(lin[i]-model->b)*(double)label[i];
				model_length+=a[i]*label[i]*lin[i];
			}
			model_length=sqrt(model_length);
			fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
			fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
			example_length=estimate_sphere(model); 
			fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
				length_of_longest_document_vector(docs,totdoc,kernel_parm));
		}
		if(verbosity>=1) {
			printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
		}
	}

	if(learn_parm->alphafile[0])
		write_alphas(learn_parm->alphafile,a,label,totdoc);

	/* this makes sure the model we return does not contain pointers to the 
	temporary documents */
	for(i=1;i<model->sv_num;i++) { 
		j=model->supvec[i]->docnum;
		if(j >= (totdoc/2)) {
			j=totdoc-j-1;
		}
		model->supvec[i]=docs_org[j];
	}

	shrink_state_cleanup(&shrink_state);
	for(i=0;i<totdoc;i++)
		free_example(docs[i],0);
	free(docs);
	free(label);
	free(inconsistent);
	free(unlabeled);
	free(c);
	free(a);
	free(a_fullset);
	free(xi_fullset);
	free(lin);
	free(learn_parm->svm_cost);
}

void svm_learn_ranking(DOC **docs, double *rankvalue, long int totdoc, 
					   long int totwords, LEARN_PARM *learn_parm, 
					   KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache, 
					   MODEL *model)
					   /* docs:        Training vectors (x-part) */
					   /* rankvalue:   Training target values that determine the ranking */
					   /* totdoc:      Number of examples in docs/label */
					   /* totwords:    Number of features (i.e. highest feature index) */
					   /* learn_parm:  Learning parameters */
					   /* kernel_parm: Kernel parameters */
					   /* kernel_cache:Initialized pointer to Cache of size 1*totdoc, if 
					   using a kernel. NULL if linear. NOTE: Cache is 
					   getting reinitialized in this function */
					   /* model:       Returns learning result (assumed empty before called) */
{
	DOC **docdiff;
	long i,j,k,totpair,kernel_cache_size;
	double *target,*alpha,cost;
	long *greater,*lesser;
	MODEL *pairmodel;
	SVECTOR *flow,*fhigh;

	totpair=0;
	for(i=0;i<totdoc;i++) {
		for(j=i+1;j<totdoc;j++) {
			if((docs[i]->queryid==docs[j]->queryid) && (rankvalue[i] != rankvalue[j])) {
				totpair++;
			}
		}
	}

	printf("Constructing %ld rank constraints...",totpair); fflush(stdout);
	docdiff=(DOC **)my_malloc(sizeof(DOC *)*totpair);
	target=(double *)my_malloc(sizeof(double)*totpair); 
	greater=(long *)my_malloc(sizeof(long)*totpair); 
	lesser=(long *)my_malloc(sizeof(long)*totpair); 

	k=0;
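	/* Each pair of examples from the same query with different rank
	   values becomes one difference constraint: if rankvalue[i] >
	   rankvalue[j], the higher-ranked document must score higher, i.e.
	   w*(x_i-x_j) >= target. For linear kernels the difference vector
	   is built explicitly with sub_ss(); for nonlinear kernels it is
	   kept as a factor-weighted list of the two original vectors so
	   that the kernel expansion can handle it later. */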
	for(i=0;i<totdoc;i++) {
		for(j=i+1;j<totdoc;j++) {
			if(docs[i]->queryid == docs[j]->queryid) {
				/* "Hijacked" costfactor to input rhs of constraints */
				/* cost=(docs[i]->costfactor+docs[j]->costfactor)/2.0; */
				cost=1;
				if(rankvalue[i] > rankvalue[j]) {
					if(kernel_parm->kernel_type == LINEAR)
						docdiff[k]=create_example(k,0,0,cost,
						sub_ss(docs[i]->fvec,docs[j]->fvec));
					else {
						flow=copy_svector(docs[j]->fvec);
						flow->factor=-1.0;
						flow->next=NULL;
						fhigh=copy_svector(docs[i]->fvec);
						fhigh->factor=1.0;
						fhigh->next=flow;
						docdiff[k]=create_example(k,0,0,cost,fhigh);
					}
					target[k]=1+docs[i]->costfactor-docs[j]->costfactor;
					greater[k]=i;
					lesser[k]=j;
					k++;
				}
				else if(rankvalue[i] < rankvalue[j]) {
					if(kernel_parm->kernel_type == LINEAR)
						docdiff[k]=create_example(k,0,0,cost,
						sub_ss(docs[j]->fvec,docs[i]->fvec));
					else {
						flow=copy_svector(docs[j]->fvec);
						flow->factor=1.0;
						flow->next=NULL;
						fhigh=copy_svector(docs[i]->fvec);
						fhigh->factor=-1.0;
						fhigh->next=flow;
						docdiff[k]=create_example(k,0,0,cost,fhigh);
					}
					target[k]=1+docs[j]->costfactor-docs[i]->costfactor;
					greater[k]=j;
					lesser[k]=i;
					k++;
				}
			}
		}
	}
	printf("done.\n"); fflush(stdout);

	/* need to get a bigger kernel cache */
	if(*kernel_cache) {
		kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
		kernel_cache_cleanup(*kernel_cache);
		(*kernel_cache)=kernel_cache_init(totpair,kernel_cache_size);
	}

	/* must use unbiased hyperplane on difference vectors */
	learn_parm->biased_hyperplane=0;
	pairmodel=(MODEL *)my_malloc(sizeof(MODEL));
	svm_learn_optimization(docdiff,target,totpair,totwords,learn_parm,
		kernel_parm,(*kernel_cache),pairmodel,NULL);

	/* Transfer the result into a more compact model. If you would like
	to output the original model on pairs of documents, see below. */
	alpha=(double *)my_malloc(sizeof(double)*totdoc); 
	for(i=0;i<totdoc;i++) {
		alpha[i]=0;
	}
	for(i=1;i<pairmodel->sv_num;i++) {
		alpha[lesser[(pairmodel->supvec[i])->docnum]]-=pairmodel->alpha[i];
		alpha[greater[(pairmodel->supvec[i])->docnum]]+=pairmodel->alpha[i];
	}
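	/* Each support vector of pairmodel is a difference x_greater-x_lesser,
	   so its coefficient contributes +alpha to the "greater" document and
	   -alpha to the "lesser" one. After this aggregation the same
	   hyperplane is expressed as an expansion over the original
	   documents, which is what the compact model below stores. */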
	model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
	model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
	model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
	model->supvec[0]=0;  /* element 0 reserved and empty for now */
	model->alpha[0]=0;
	model->sv_num=1;
	for(i=0;i<totdoc;i++) {
		if(alpha[i]) {
			model->supvec[model->sv_num]=docs[i];
			model->alpha[model->sv_num]=alpha[i];
			model->index[i]=model->sv_num;
			model->sv_num++;
		}
		else {
			model->index[i]=-1;
		}
	}
	model->at_upper_bound=0;
	model->b=0;	       
	model->lin_weights=NULL;
	model->totwords=totwords;
	model->totdoc=totdoc;
	model->kernel_parm=(*kernel_parm);
	model->loo_error=-1;
	model->loo_recall=-1;
	model->loo_precision=-1;
	model->xa_error=-1;
	model->xa_recall=-1;
	model->xa_precision=-1;

	free(alpha);
	free(greater);
	free(lesser);
	free(target);

	/* If you would like to output the original model on pairs of
	document, replace the following lines with '(*model)=(*pairmodel);' */
	for(i=0;i<totpair;i++)
		free_example(docdiff[i],1);
	free(docdiff);
	free_model(pairmodel,0);
}


/* The following solves a freely defined and given set of
inequalities. The optimization problem is of the following form:

min 0.5 w*w + C sum_i C_i \xi_i
s.t. x_i * w > rhs_i - \xi_i

This corresponds to the -z o option. */
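/* Note that binary classification with an unbiased hyperplane is the
special case of this formulation in which each x_i is the label-scaled
training vector y_i*x_i and rhs_i=1. */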

void svm_learn_optimization(DOC **docs, double *rhs, long int
							totdoc, long int totwords, 
							LEARN_PARM *learn_parm, 
							KERNEL_PARM *kernel_parm, 
							KERNEL_CACHE *kernel_cache, MODEL *model,
							double *alpha)
							/* docs:        Left-hand side of inequalities (x-part) */
							/* rhs:         Right-hand side of inequalities */
							/* totdoc:      Number of examples in docs/label */
							/* totwords:    Number of features (i.e. highest feature index) */
							/* learn_parm:  Learning parameters */
							/* kernel_parm: Kernel parameters */
							/* kernel_cache:Initialized Cache of size 1*totdoc, if using a kernel. 
							NULL if linear.*/
							/* model:       Returns solution as SV expansion (assumed empty before called) */
							/* alpha:       Start values for the alpha variables or NULL
							pointer. The new alpha values are returned after 
							optimization if not NULL. Array must be of size totdoc. */
{
	long i,*label;
	long misclassified,upsupvecnum;
	double loss,model_length,alphasum,example_length;
	double maxdiff,*lin,*a,*c;
	double runtime_start,runtime_end;
	long iterations,maxslackid,svsetnum;
	long *unlabeled,*inconsistent;
	double r_delta_avg;
	long *index,*index2dnum;
	double *weights,*slack,*alphaslack;
	CFLOAT *aicache;  /* buffer to keep one row of hessian */

	TIMING timing_profile;
	SHRINK_STATE shrink_state;

	runtime_start=get_runtime();
	timing_profile.time_kernel=0;
	timing_profile.time_opti=0;
	timing_profile.time_shrink=0;
	timing_profile.time_update=0;
	timing_profile.time_model=0;
	timing_profile.time_check=0;
	timing_profile.time_select=0;
	kernel_cache_statistic=0;

	learn_parm->totwords=totwords;

	/* make sure -n value is reasonable */
	if((learn_parm->svm_newvarsinqp < 2) 
		|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
			learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
	}

	init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

	label = (long *)my_malloc(sizeof(long)*totdoc);
	unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
	inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
	c = (double *)my_malloc(sizeof(double)*totdoc);
	a = (double *)my_malloc(sizeof(double)*totdoc);
	lin = (double *)my_malloc(sizeof(double)*totdoc);
	learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
	model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
	model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
	model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

	model->at_upper_bound=0;
	model->b=0;	       
	model->supvec[0]=0;  /* element 0 reserved and empty for now */
	model->alpha[0]=0;
	model->lin_weights=NULL;
	model->totwords=totwords;
	model->totdoc=totdoc;
	model->kernel_parm=(*kernel_parm);
	model->sv_num=1;
	model->loo_error=-1;
	model->loo_recall=-1;
	model->loo_precision=-1;
	model->xa_error=-1;
	model->xa_recall=-1;
	model->xa_precision=-1;

	r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
	if(learn_parm->svm_c == 0.0) {  /* default value for C */
		learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
		if(verbosity>=1) 
			printf("Setting default regularization parameter C=%.4f\n",
			learn_parm->svm_c);
	}

	learn_parm->biased_hyperplane=0; /* learn an unbiased hyperplane */

	learn_parm->eps=0.0;      /* No margin, unless explicitly handcoded
							  in the right-hand side in the training
							  set.  */

	for(i=0;i<totdoc;i++) {    /* various inits */
		docs[i]->docnum=i;
		a[i]=0;
		lin[i]=0;
		c[i]=rhs[i];       /* set right-hand side */
		unlabeled[i]=0;
		inconsistent[i]=0;
		learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
			docs[i]->costfactor;
		label[i]=1;
	}
	if(learn_parm->sharedslack) /* if shared slacks are used, they must */
		for(i=0;i<totdoc;i++)     /*  be used on every constraint */
			if(!docs[i]->slackid) {
				perror("Error: Missing shared slacks definitions in some of the examples.");
				exit(0);
			}

	/* print kernel matrix */
	/*
	int j;
	for(i=0;i<totdoc;i++) {
		printf("\n");
		for(j=0;j<totdoc;j++) {
			printf("%.4f\t",kernel(kernel_parm,docs[i],docs[j]));
		}
		printf("\n");
	}
	*/

	/* compute starting state for initial alpha values */
	if(alpha) {
		if(verbosity>=1) {
			printf("Computing starting state..."); fflush(stdout);
		}
		index = (long *)my_malloc(sizeof(long)*totdoc);
		index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
		if(kernel_parm->kernel_type == LINEAR) {
			weights=(double *)my_malloc(sizeof(double)*(totwords+1));
			clear_nvector(weights,totwords); /* set weights to zero */
			aicache=NULL;
		}
		else {
			weights=NULL;
			aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
		}
		for(i=0;i<totdoc;i++) {    /* create full index and clip alphas */
			index[i]=1;
			alpha[i]=fabs(alpha[i]);
			if(alpha[i]<0) alpha[i]=0;
			if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
		}
		if(kernel_cache && (kernel_parm->kernel_type != LINEAR)) {
			for(i=0;i<totdoc;i++)     /* fill kernel cache with unbounded SV */
				if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
					&& (kernel_cache_space_available(kernel_cache)))
					cache_kernel_row(kernel_cache,docs,i,kernel_parm);
			for(i=0;i<totdoc;i++)     /* fill rest of kernel cache with bounded SV */
				if((alpha[i]==learn_parm->svm_cost[i])
					&& (kernel_cache_space_available(kernel_cache)))
					cache_kernel_row(kernel_cache,docs,i,kernel_parm);
		}
		(void)compute_index(index,totdoc,index2dnum);
		update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
			totwords,kernel_parm,kernel_cache,lin,aicache,
			weights);
		(void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
			learn_parm,index2dnum,index2dnum,model);
		for(i=0;i<totdoc;i++) {    /* copy initial alphas */
			a[i]=alpha[i];
		}
		free(index);
		free(index2dnum);
		if(weights) free(weights);
		if(aicache) free(aicache);
		if(verbosity>=1) {
			printf("done.\n");  fflush(stdout);
		}
	}

	/* removing inconsistent does not work for general optimization problem */
	if(learn_parm->remove_inconsistent) {
		learn_parm->remove_inconsistent = 0;
		printf("'remove inconsistent' not available in this mode. Switching option off!"); fflush(stdout);
	}

	/* caching makes no sense for linear kernel */
	if(kernel_parm->kernel_type == LINEAR) {
		/* kernel_cache = NULL; */
	}

	if(verbosity==1) {
		printf("Optimizing"); fflush(stdout);
	}

	/* train the svm */
	if(learn_parm->sharedslack)
		iterations=optimize_to_convergence_sharedslack(docs,label,totdoc,
			totwords,learn_parm,kernel_parm,
			kernel_cache,&shrink_state,model,
			a,lin,c,&timing_profile,
			&maxdiff);
	else
		iterations=optimize_to_convergence(docs,label,totdoc,
			totwords,learn_parm,kernel_parm,
			kernel_cache,&shrink_state,model,
			inconsistent,unlabeled,
			a,lin,c,&timing_profile,
			&maxdiff,(long)-1,(long)1);

	if(verbosity>=1) {
		if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

		misclassified=0;
		for(i=0;(i<totdoc);i++) { /* get final statistic */
			if((lin[i]-model->b)*(double)label[i] <= 0.0)
				misclassified++;
		}

		printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);

		runtime_end=get_runtime();
		if(verbosity>=2) {
			printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
				(runtime_end-runtime_start)/100.0,
				(100.0*timing_profile.time_kernel)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_opti)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_shrink)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_update)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_model)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_check)/(runtime_end-runtime_start),
				(100.0*timing_profile.time_select)/(runtime_end-runtime_start));
		}
		else {
			printf("Runtime in cpu-seconds: %.2f\n",
				(runtime_end-runtime_start)/100.0);
		}
	}
	if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
		loss=0;
		model_length=0;
		alphasum=0;
		for(i=0;i<totdoc;i++) {
			if((lin[i]-model->b)*(double)label[i] < c[i]-learn_parm->epsilon_crit)
				loss+=c[i]-(lin[i]-model->b)*(double)label[i];
			model_length+=a[i]*label[i]*lin[i];
			alphasum+=rhs[i]*a[i];
		}
		model_length=sqrt(model_length);
		fprintf(stdout,"Dual objective value: dval=%.5f\n",
			alphasum-0.5*model_length*model_length);
		fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
	}

	if(learn_parm->sharedslack) {
		index = (long *)my_malloc(sizeof(long)*totdoc);
		index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
		maxslackid=0;
		for(i=0;i<totdoc;i++) {    /* create full index */
			index[i]=1;
			if(maxslackid<docs[i]->slackid)
				maxslackid=docs[i]->slackid;
		}
		(void)compute_index(index,totdoc,index2dnum);
		slack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
		alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
		for(i=0;i<=maxslackid;i++) {    /* init shared slacks */
			slack[i]=0;
			alphaslack[i]=0;
		}
		for(i=0;i<totdoc;i++) {    /* compute alpha aggregated by slack */
			alphaslack[docs[i]->slackid]+=a[i];
		}
		compute_shared_slacks(docs,label,a,lin,c,index2dnum,learn_parm,
			slack,alphaslack);
		loss=0;
		model->at_upper_bound=0;
		svsetnum=0;
		for(i=0;i<=maxslackid;i++) {    /* sum up loss; count slack sets at bound / with non-zero alpha */
			loss+=slack[i];
			if(alphaslack[i] > (learn_parm->svm_c - learn_parm->epsilon_a))
				model->at_upper_bound++;
			if(alphaslack[i] > learn_parm->epsilon_a)
				svsetnum++;
		}
		free(index);
		free(index2dnum);
		free(slack);
		free(alphaslack);
	}

	if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
		if(learn_parm->sharedslack) {
			printf("Number of SV: %ld\n",
				model->sv_num-1);
			printf("Number of non-zero slack variables: %ld (%ld slacks have non-zero alpha)\n",
				model->at_upper_bound,svsetnum);
			fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
		}
		else {
			upsupvecnum=0;
			for(i=1;i<model->sv_num;i++) {
				if(fabs(model->alpha[i]) >=
					(learn_parm->svm_cost[(model->supvec[i])->docnum]-
					learn_parm->epsilon_a))
					upsupvecnum++;
			}
			printf("Number of SV: %ld (including %ld at upper bound)\n",
				model->sv_num-1,upsupvecnum);
			fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
		}
		example_length=estimate_sphere(model);
		fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
			length_of_longest_document_vector(docs,totdoc,kernel_parm));
	}
	if(verbosity>=1) {
		printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
	}

	if(alpha) {
		for(i=0;i<totdoc;i++) {    /* copy final alphas */
			alpha[i]=a[i];
		}
	}

	if(learn_parm->alphafile[0])
		write_alphas(learn_parm->alphafile,a,label,totdoc);

	shrink_state_cleanup(&shrink_state);
	free(label);
	free(unlabeled);
	free(inconsistent);
	free(c);
	free(a);
	free(lin);
	free(learn_parm->svm_cost);
}


long optimize_to_convergence(DOC **docs, long int *label, long int totdoc, 
							 long int totwords, LEARN_PARM *learn_parm, 
							 KERNEL_PARM *kernel_parm, 
							 KERNEL_CACHE *kernel_cache, 
							 SHRINK_STATE *shrink_state, MODEL *model, 
							 long int *inconsistent, long int *unlabeled, 
							 double *a, double *lin, double *c, 
							 TIMING *timing_profile, double *maxdiff, 
							 long int heldout, long int retrain)
							 /* docs: Training vectors (x-part) */
							 /* label: Training labels/value (y-part, zero if test example for
							 transduction) */
							 /* totdoc: Number of examples in docs/label */
							 /* totwords: Number of features (i.e. highest feature index) */
							 /* learn_parm: Learning parameters */
							 /* kernel_parm: Kernel parameters */
							 /* kernel_cache: Initialized/partly filled Cache, if using a kernel. 
							 NULL if linear. */
							 /* shrink_state: State of active variables */
							 /* model: Returns learning result */
							 /* inconsistent: examples thrown out as inconsistent */
							 /* unlabeled: test examples for transduction */
							 /* a: alphas */
							 /* lin: linear component of gradient */
							 /* c: right hand side of inequalities (margin) */
							 /* maxdiff: returns maximum violation of KT-conditions */
							 /* heldout: marks held-out example for leave-one-out (or -1) */
							 /* retrain: selects training mode (1=regular / 2=holdout) */
{
	long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink;
	long inconsistentnum,choosenum,already_chosen=0,iteration;
	long misclassified,supvecnum=0,*active2dnum,inactivenum;
	long *working2dnum,*selexam;
	long activenum;
	double criterion,eq;
	double *a_old;
	double t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */
	long transductcycle;
	long transduction;
	double epsilon_crit_org; 
	double bestmaxdiff;
	long   bestmaxdiffiter,terminate;

	double *selcrit;  /* buffer for sorting */        
	CFLOAT *aicache;  /* buffer to keep one row of hessian */
	double *weights;  /* buffer for weight vector in linear case */
	QP qp;            /* buffer for one quadratic program */

	epsilon_crit_org=learn_parm->epsilon_crit; /* save org */
	if(kernel_parm->kernel_type == LINEAR) {
		learn_parm->epsilon_crit=2.0;
		/* kernel_cache=NULL; */  /* caching makes no sense for linear kernel */
	} 
	learn_parm->epsilon_shrink=2;
	(*maxdiff)=1;

	learn_parm->totwords=totwords;

	chosen = (long *)my_malloc(sizeof(long)*totdoc);
	last_suboptimal_at = (long *)my_malloc(sizeof(long)*totdoc);
	key = (long *)my_malloc(sizeof(long)*(totdoc+11)); 
	selcrit = (double *)my_malloc(sizeof(double)*totdoc);
	selexam = (long *)my_malloc(sizeof(long)*totdoc);
	a_old = (double *)my_malloc(sizeof(double)*totdoc);
	aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
	working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
	active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
	qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_ce0 = (double *)my_malloc(sizeof(double));
	qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize
		*learn_parm->svm_maxqpsize);
	qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	if(kernel_parm->kernel_type == LINEAR) {
		weights=create_nvector(totwords);
		clear_nvector(weights,totwords); /* set weights to zero */
	}
	else 
		weights=NULL;

	choosenum=0;
	inconsistentnum=0;
	transductcycle=0;
	transduction=0;
	if(!retrain) retrain=1;
	iteration=1;
	bestmaxdiffiter=1;
	bestmaxdiff=999999999;
	terminate=0;

	if(kernel_cache) {
		kernel_cache->time=iteration;  /* for lru cache */
		kernel_cache_reset_lru(kernel_cache);
	}

	for(i=0;i<totdoc;i++) {    /* various inits */
		chosen[i]=0;
		a_old[i]=a[i];
		last_suboptimal_at[i]=1;
		if(inconsistent[i]) 
			inconsistentnum++;
		if(unlabeled[i]) {
			transduction=1;
		}
	}
	activenum=compute_index(shrink_state->active,totdoc,active2dnum);
	inactivenum=totdoc-activenum;
	clear_index(working2dnum);

	/* repeat this loop until we have convergence */
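	/* Each iteration follows the standard decomposition scheme:
	   (1) select a small working set (steepest feasible descent),
	   (2) fetch/cache the kernel rows it needs, (3) solve the QP
	   restricted to the working set, (4) update the gradient (lin)
	   and recompute the model/bias, (5) check the KT optimality
	   conditions, and (6) periodically shrink variables that appear
	   permanently stuck at their bounds. */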
	for(;retrain && (!terminate);iteration++) {

		if(kernel_cache)
			kernel_cache->time=iteration;  /* for lru cache */
		if(verbosity>=2) {
			printf("Iteration %ld: ",iteration); fflush(stdout);
		}
		else if(verbosity==1) {
			printf("."); fflush(stdout);
		}

		if(verbosity>=2) t0=get_runtime();
		if(verbosity>=3) {
			printf("\nSelecting working set... "); fflush(stdout); 
		}

		if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize) 
			learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;

		i=0;
		for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */
			if((chosen[j]>=(learn_parm->svm_maxqpsize/
				minl(learn_parm->svm_maxqpsize,
				learn_parm->svm_newvarsinqp))) 
				|| (inconsistent[j])
				|| (j == heldout)) {
					chosen[j]=0; 
					choosenum--; 
			}
			else {
				chosen[j]++;
				working2dnum[i++]=j;
			}
		}
		working2dnum[i]=-1;

		if(retrain == 2) {
			choosenum=0;
			for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* fully clear working set */
				chosen[j]=0; 
			}
			clear_index(working2dnum);
			for(i=0;i<totdoc;i++) { /* set inconsistent examples to zero (-i 1) */
				if((inconsistent[i] || (heldout==i)) && (a[i] != 0.0)) {
					chosen[i]=99999;
					choosenum++;
					a[i]=0;
				}
			}
			if(learn_parm->biased_hyperplane) {
				eq=0;
				for(i=0;i<totdoc;i++) { /* make sure we fulfill equality constraint */
					eq+=a[i]*label[i];
				}
				for(i=0;(i<totdoc) && (fabs(eq) > learn_parm->epsilon_a);i++) {
					if((eq*label[i] > 0) && (a[i] > 0)) {
						chosen[i]=88888;
						choosenum++;
						if((eq*label[i]) > a[i]) {
							eq-=(a[i]*label[i]);
							a[i]=0;
						}
						else {
							a[i]-=(eq*label[i]);
							eq=0;
						}
					}
				}
			}
			compute_index(chosen,totdoc,working2dnum);
		}
		else {      /* select working set according to steepest gradient */
			if(iteration % 101) {
				already_chosen=0;
				if((minl(learn_parm->svm_newvarsinqp,
					learn_parm->svm_maxqpsize-choosenum)>=4) 
					&& (kernel_parm->kernel_type != LINEAR)) {
						/* select part of the working set from cache */
						already_chosen=select_next_qp_subproblem_grad(
							label,unlabeled,a,lin,c,totdoc,
							(long)(minl(learn_parm->svm_maxqpsize-choosenum,
							learn_parm->svm_newvarsinqp)
							/2),
							learn_parm,inconsistent,active2dnum,
							working2dnum,selcrit,selexam,kernel_cache,1,
							key,chosen);
						choosenum+=already_chosen;
				}
				choosenum+=select_next_qp_subproblem_grad(
					label,unlabeled,a,lin,c,totdoc,
					minl(learn_parm->svm_maxqpsize-choosenum,
					learn_parm->svm_newvarsinqp-already_chosen),
					learn_parm,inconsistent,active2dnum,
					working2dnum,selcrit,selexam,kernel_cache,0,key,
					chosen);
			}
			else { /* once in a while, select a somewhat random working set
				   to get unlocked of infinite loops due to numerical
				   inaccuracies in the core qp-solver */
				choosenum+=select_next_qp_subproblem_rand(
					label,unlabeled,a,lin,c,totdoc,
					minl(learn_parm->svm_maxqpsize-choosenum,
					learn_parm->svm_newvarsinqp),
					learn_parm,inconsistent,active2dnum,
					working2dnum,selcrit,selexam,kernel_cache,key,
					chosen,iteration);
			}
		}

		if(verbosity>=2) {
			printf(" %ld vectors chosen\n",choosenum); fflush(stdout); 
		}

		if(verbosity>=2) t1=get_runtime();

		if(kernel_cache) 
			cache_multiple_kernel_rows(kernel_cache,docs,working2dnum,
			choosenum,kernel_parm); 

		if(verbosity>=2) t2=get_runtime();
		if(retrain != 2) {
			optimize_svm(docs,label,unlabeled,inconsistent,0.0,chosen,active2dnum,
				model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm,
				aicache,kernel_parm,&qp,&epsilon_crit_org);
		}

		if(verbosity>=2) t3=get_runtime();
		update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc,
			totwords,kernel_parm,kernel_cache,lin,aicache,
			weights);

		if(verbosity>=2) t4=get_runtime();
		supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c,
			learn_parm,working2dnum,active2dnum,model);

		if(verbosity>=2) t5=get_runtime();

		/* The following computation of the objective function works only */
		/* relative to the active variables */
		if(verbosity>=3) {
			criterion=compute_objective_function(a,lin,c,learn_parm->eps,label,
				active2dnum);
			printf("Objective function (over active variables): %.16f\n",criterion);
			fflush(stdout); 
		}

		for(jj=0;(i=working2dnum[jj])>=0;jj++) {
			a_old[i]=a[i];
		}

		if(retrain == 2) {  /* reset inconsistent unlabeled examples */
			for(i=0;(i<totdoc);i++) {
				if(inconsistent[i] && unlabeled[i]) {
					inconsistent[i]=0;
					label[i]=0;
				}
			}
		}

		retrain=check_optimality(model,label,unlabeled,a,lin,c,totdoc,learn_parm,
			maxdiff,epsilon_crit_org,&misclassified,
			inconsistent,active2dnum,last_suboptimal_at,
			iteration,kernel_parm);

		if(verbosity>=2) {
			t6=get_runtime();
			timing_profile->time_select+=t1-t0;
			timing_profile->time_kernel+=t2-t1;
			timing_profile->time_opti+=t3-t2;
			timing_profile->time_update+=t4-t3;
			timing_profile->time_model+=t5-t4;
			timing_profile->time_check+=t6-t5;
		}

		/* checking whether optimizer got stuck */
		if((*maxdiff) < bestmaxdiff) {
			bestmaxdiff=(*maxdiff);
			bestmaxdiffiter=iteration;
		}
		if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) { 
			/* long time no progress? */
			terminate=1;
			retrain=0;
			if(verbosity>=1) 
				printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n");
		}

		noshrink=0;
		if((!retrain) && (inactivenum>0) 
			&& ((!learn_parm->skip_final_opt_check) 
			|| (kernel_parm->kernel_type == LINEAR))) { 
				if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) 
					|| (verbosity>=2)) {
						if(verbosity==1) {
							printf("\n");
						}
						printf(" Checking optimality of inactive variables..."); 
						fflush(stdout);
				}
				t1=get_runtime();
				reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc,
					totwords,iteration,learn_parm,inconsistent,
					docs,kernel_parm,kernel_cache,model,aicache,
					weights,maxdiff);
				/* Update to new active variables. */
				activenum=compute_index(shrink_state->active,totdoc,active2dnum);
				inactivenum=totdoc-activenum;
				/* reset watchdog */
				bestmaxdiff=(*maxdiff);
				bestmaxdiffiter=iteration;
				/* termination criterion */
				noshrink=1;
				retrain=0;
				if((*maxdiff) > learn_parm->epsilon_crit) 
					retrain=1;
				timing_profile->time_shrink+=get_runtime()-t1;
				if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) 
					|| (verbosity>=2)) {
						printf("done.\n");  fflush(stdout);
						printf(" Number of inactive variables = %ld\n",inactivenum);
				}		  
		}

		if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff))) 
			learn_parm->epsilon_crit=(*maxdiff);
		if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) {
			learn_parm->epsilon_crit/=2.0;
			retrain=1;
			noshrink=1;
		}
		if(learn_parm->epsilon_crit<epsilon_crit_org) 
			learn_parm->epsilon_crit=epsilon_crit_org;

		if(verbosity>=2) {
			printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n",
				supvecnum,model->at_upper_bound,(*maxdiff)); 
			fflush(stdout);
		}
		if(verbosity>=3) {
			printf("\n");
		}

		if((!retrain) && (transduction)) {
			for(i=0;(i<totdoc);i++) {
				shrink_state->active[i]=1;
			}
			activenum=compute_index(shrink_state->active,totdoc,active2dnum);
			inactivenum=0;
			if(verbosity==1) printf("done\n");
			retrain=incorporate_unlabeled_examples(model,label,inconsistent,
				unlabeled,a,lin,totdoc,
				selcrit,selexam,key,
				transductcycle,kernel_parm,
				learn_parm);
			epsilon_crit_org=learn_parm->epsilon_crit;
			if(kernel_parm->kernel_type == LINEAR)
				learn_parm->epsilon_crit=1; 
			transductcycle++;
			/* reset watchdog */
			bestmaxdiff=(*maxdiff);
			bestmaxdiffiter=iteration;
		} 
		else if(((iteration % 10) == 0) && (!noshrink)) {
			activenum=shrink_problem(docs,learn_parm,shrink_state,kernel_parm,
				active2dnum,last_suboptimal_at,iteration,totdoc,
				maxl((long)(activenum/10),
				maxl((long)(totdoc/500),100)),
				a,inconsistent);
			inactivenum=totdoc-activenum;
			if((kernel_cache)
				&& (supvecnum>kernel_cache->max_elems)
				&& ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) {
					kernel_cache_shrink(kernel_cache,totdoc,
						minl((kernel_cache->activenum-activenum),
						(kernel_cache->activenum-supvecnum)),
						shrink_state->active); 
			}
		}

		if((!retrain) && learn_parm->remove_inconsistent) {
			if(verbosity>=1) {
				printf(" Moving training errors to inconsistent examples...");
				fflush(stdout);
			}
			if(learn_parm->remove_inconsistent == 1) {
				retrain=identify_inconsistent(a,label,unlabeled,totdoc,learn_parm,
					&inconsistentnum,inconsistent); 
			}
			else if(learn_parm->remove_inconsistent == 2) {
				retrain=identify_misclassified(lin,label,unlabeled,totdoc,
					model,&inconsistentnum,inconsistent); 
			}
			else if(learn_parm->remove_inconsistent == 3) {
				retrain=identify_one_misclassified(lin,label,unlabeled,totdoc,
					model,&inconsistentnum,inconsistent);
			}
			if(retrain) {
				if(kernel_parm->kernel_type == LINEAR) { /* reinit shrinking */
					learn_parm->epsilon_crit=2.0;
				} 
			}
			if(verbosity>=1) {
				printf("done.\n");
				if(retrain) {
					printf(" Now %ld inconsistent examples.\n",inconsistentnum);
				}
			}
		}
	} /* end of loop */

	free(chosen);
	free(last_suboptimal_at);
	free(key);
	free(selcrit);
	free(selexam);
	free(a_old);
	free(aicache);
	free(working2dnum);
	free(active2dnum);
	free(qp.opt_ce);
	free(qp.opt_ce0);
	free(qp.opt_g);
	free(qp.opt_g0);
	free(qp.opt_xinit);
	free(qp.opt_low);
	free(qp.opt_up);
	if(weights) free(weights);

	learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */
	model->maxdiff=(*maxdiff);

	return(iteration);
}

long optimize_to_convergence_sharedslack(DOC **docs, long int *label, 
										 long int totdoc, 
										 long int totwords, LEARN_PARM *learn_parm, 
										 KERNEL_PARM *kernel_parm, 
										 KERNEL_CACHE *kernel_cache, 
										 SHRINK_STATE *shrink_state, MODEL *model, 
										 double *a, double *lin, double *c, 
										 TIMING *timing_profile, double *maxdiff)
										 /* docs: Training vectors (x-part) */
										 /* label: Training labels/values (y-part, zero if test example for
										 transduction) */
										 /* totdoc: Number of examples in docs/label */
										 /* totwords: Number of features (i.e. highest feature index) */
										 /* learn_parm: Learning parameters */
										 /* kernel_parm: Kernel parameters */
										 /* kernel_cache: Initialized/partly filled Cache, if using a kernel. 
										 NULL if linear. */
										 /* shrink_state: State of active variables */
										 /* model: Returns learning result */
										 /* a: alphas */
										 /* lin: linear component of gradient */
										 /* c: right hand side of inequalities (margin) */
										 /* maxdiff: returns maximum violation of KT-conditions */
{
	long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink,*unlabeled;
	long *inconsistent,choosenum,already_chosen=0,iteration;
	long misclassified,supvecnum=0,*active2dnum,inactivenum;
	long *working2dnum,*selexam,*ignore;
	long activenum,retrain,maxslackid,slackset,jointstep;
	double criterion,eq_target;
	double *a_old,*alphaslack;
	double t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */
	double epsilon_crit_org,maxsharedviol; 
	double bestmaxdiff;
	long   bestmaxdiffiter,terminate;

	double *selcrit;  /* buffer for sorting */        
	CFLOAT *aicache;  /* buffer to keep one row of hessian */
	double *weights;  /* buffer for weight vector in linear case */
	QP qp;            /* buffer for one quadratic program */
	double *slack;    /* vector of slack variables for optimization with
					  shared slacks */

	epsilon_crit_org=learn_parm->epsilon_crit; /* save org */
	if(kernel_parm->kernel_type == LINEAR) {
		learn_parm->epsilon_crit=2.0;
		/* kernel_cache=NULL; */  /* caching makes no sense for linear kernel */
	} 
	learn_parm->epsilon_shrink=2;
	(*maxdiff)=1;

	learn_parm->totwords=totwords;

	chosen = (long *)my_malloc(sizeof(long)*totdoc);
	unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
	inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
	ignore = (long *)my_malloc(sizeof(long)*totdoc);
	last_suboptimal_at = (long *)my_malloc(sizeof(long)*totdoc);
	key = (long *)my_malloc(sizeof(long)*(totdoc+11)); 
	selcrit = (double *)my_malloc(sizeof(double)*totdoc);
	selexam = (long *)my_malloc(sizeof(long)*totdoc);
	a_old = (double *)my_malloc(sizeof(double)*totdoc);
	aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
	working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
	active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
	qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_ce0 = (double *)my_malloc(sizeof(double));
	qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize
		*learn_parm->svm_maxqpsize);
	qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
	if(kernel_parm->kernel_type == LINEAR) {
		weights=create_nvector(totwords);
		clear_nvector(weights,totwords); /* set weights to zero */
	}
	else 
		weights=NULL;
	maxslackid=0;
	for(i=0;i<totdoc;i++) {    /* determine size of slack array */
		if(maxslackid<docs[i]->slackid)
			maxslackid=docs[i]->slackid;
	}
	slack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
	alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
	for(i=0;i<=maxslackid;i++) {    /* init shared slacks */
		slack[i]=0;
		alphaslack[i]=0;
	}

	choosenum=0;
	retrain=1;
	iteration=1;
	bestmaxdiffiter=1;
	bestmaxdiff=999999999;
	terminate=0;

	if(kernel_cache) {
		kernel_cache->time=iteration;  /* for lru cache */
		kernel_cache_reset_lru(kernel_cache);
	}

	for(i=0;i<totdoc;i++) {    /* various inits */
		chosen[i]=0;
		unlabeled[i]=0;
		inconsistent[i]=0;
		ignore[i]=0;
		alphaslack[docs[i]->slackid]+=a[i];
		a_old[i]=a[i];
		last_suboptimal_at[i]=1;
	}
	activenum=compute_index(shrink_state->active,totdoc,active2dnum);
	inactivenum=totdoc-activenum;
	clear_index(working2dnum);

	/* call to init slack and alphaslack */
	compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm,
		slack,alphaslack);

	/* repeat this loop until we have convergence */
	reset_hideo_globals();
	for(;retrain && (!terminate);iteration++) {

		if(kernel_cache)
			kernel_cache->time=iteration;  /* for lru cache */
		if(verbosity>=2) {
			printf("Iteration %ld: ",iteration); fflush(stdout);
		}
		else if(verbosity==1) {
			printf("."); fflush(stdout);
		}

		if(verbosity>=2) t0=get_runtime();
		if(verbosity>=3) {
			printf("\nSelecting working set... "); fflush(stdout); 
		}

		if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize) 
			learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;

		/* select working set according to steepest gradient */
		jointstep=0;
		eq_target=0;
		if(iteration % 101) {
			slackset=select_next_qp_slackset(docs,label,a,lin,slack,alphaslack,c,
				learn_parm,active2dnum,&maxsharedviol);
			if((!(iteration % 100))
				|| (!slackset) || (maxsharedviol<learn_parm->epsilon_crit)){
					/* do a step with examples from different slack sets */
					if(verbosity >= 2) {
						printf("(i-step)"); fflush(stdout);
					}
					i=0;
					for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear old part of working set */
						if((chosen[j]>=(learn_parm->svm_maxqpsize/
							minl(learn_parm->svm_maxqpsize,
							learn_parm->svm_newvarsinqp)))) {
								chosen[j]=0; 
								choosenum--; 
						}
						else {
							chosen[j]++;
							working2dnum[i++]=j;
						}
					}
					working2dnum[i]=-1;

					already_chosen=0;
					if((minl(learn_parm->svm_newvarsinqp,
						learn_parm->svm_maxqpsize-choosenum)>=4) 
						&& (kernel_parm->kernel_type != LINEAR)) {
							/* select part of the working set from cache */
							already_chosen=select_next_qp_subproblem_grad(
								label,unlabeled,a,lin,c,totdoc,
								(long)(minl(learn_parm->svm_maxqpsize-choosenum,
								learn_parm->svm_newvarsinqp)
								/2),
								learn_parm,inconsistent,active2dnum,
								working2dnum,selcrit,selexam,kernel_cache,
								(long)1,key,chosen);
							choosenum+=already_chosen;
					}
					choosenum+=select_next_qp_subproblem_grad(
						label,unlabeled,a,lin,c,totdoc,
						minl(learn_parm->svm_maxqpsize-choosenum,
						learn_parm->svm_newvarsinqp-already_chosen),
						learn_parm,inconsistent,active2dnum,
						working2dnum,selcrit,selexam,kernel_cache,
						(long)0,key,chosen);
			}
			else { /* do a step with all examples from same slack set */
				if(verbosity >= 2) {
					printf("(j-step on %ld)",slackset); fflush(stdout);
				}
				jointstep=1;
				for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */
					chosen[j]=0; 
				}
				working2dnum[0]=-1;
				eq_target=alphaslack[slackset];
				for(j=0;j<totdoc;j++) {                  /* mask all but slackset */
					/* for(jj=0;(j=active2dnum[jj])>=0;jj++) { */
					if(docs[j]->slackid != slackset)
						ignore[j]=1; 
					else {
						ignore[j]=0; 
						learn_parm->svm_cost[j]=learn_parm->svm_c;
						/* printf("Inslackset(%ld,%ld)",j,shrink_state->active[j]); */
					}
				}
				learn_parm->biased_hyperplane=1;
				choosenum=select_next_qp_subproblem_grad(
					label,unlabeled,a,lin,c,totdoc,
					learn_parm->svm_maxqpsize,
					learn_parm,ignore,active2dnum,
					working2dnum,selcrit,selexam,kernel_cache,
					(long)0,key,chosen);
				learn_parm->biased_hyperplane=0;
			}
		}
		else { /* once in a while, select a somewhat random working set to
			   break out of infinite loops caused by numerical inaccuracies
			   in the core qp-solver */
			choosenum+=select_next_qp_subproblem_rand(
				label,unlabeled,a,lin,c,totdoc,
				minl(learn_parm->svm_maxqpsize-choosenum,
				learn_parm->svm_newvarsinqp),
				learn_parm,inconsistent,active2dnum,
				working2dnum,selcrit,selexam,kernel_cache,key,
				chosen,iteration);
		}

		if(verbosity>=2) {
			printf(" %ld vectors chosen\n",choosenum); fflush(stdout); 
		}

		if(verbosity>=2) t1=get_runtime();

		if(kernel_cache) 
			cache_multiple_kernel_rows(kernel_cache,docs,working2dnum,
			choosenum,kernel_parm); 

		if(verbosity>=2) t2=get_runtime();
		if(jointstep) learn_parm->biased_hyperplane=1;
		optimize_svm(docs,label,unlabeled,ignore,eq_target,chosen,active2dnum,
			model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm,
			aicache,kernel_parm,&qp,&epsilon_crit_org);
		learn_parm->biased_hyperplane=0;

		for(jj=0;(i=working2dnum[jj])>=0;jj++)   /* recompute sums of alphas */
			alphaslack[docs[i]->slackid]+=(a[i]-a_old[i]);
		for(jj=0;(i=working2dnum[jj])>=0;jj++) { /* reduce alpha to fulfill
												 constraints */
			if(alphaslack[docs[i]->slackid] > learn_parm->svm_c) {
				if(a[i] < (alphaslack[docs[i]->slackid]-learn_parm->svm_c)) {
					alphaslack[docs[i]->slackid]-=a[i];
					a[i]=0;
				}
				else {
					a[i]-=(alphaslack[docs[i]->slackid]-learn_parm->svm_c);
					alphaslack[docs[i]->slackid]=learn_parm->svm_c;
				}
			}
		}
		for(jj=0;(i=active2dnum[jj])>=0;jj++) 
			learn_parm->svm_cost[i]=a[i]+(learn_parm->svm_c
			-alphaslack[docs[i]->slackid]);
		model->at_upper_bound=0;
		for(jj=0;jj<=maxslackid;jj++) {
			if(alphaslack[jj]>(learn_parm->svm_c-learn_parm->epsilon_a)) 
				model->at_upper_bound++;
		}

		if(verbosity>=2) t3=get_runtime();
		update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc,
			totwords,kernel_parm,kernel_cache,lin,aicache,
			weights);
		compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm,
			slack,alphaslack);

		if(verbosity>=2) t4=get_runtime();
		supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c,
			learn_parm,working2dnum,active2dnum,model);

		if(verbosity>=2) t5=get_runtime();

		/* The following computation of the objective function works only */
		/* relative to the active variables */
		if(verbosity>=3) {
			criterion=compute_objective_function(a,lin,c,learn_parm->eps,label,
				active2dnum);
			printf("Objective function (over active variables): %.16f\n",criterion);
			fflush(stdout); 
		}

		for(jj=0;(i=working2dnum[jj])>=0;jj++) {
			a_old[i]=a[i];
		}

		retrain=check_optimality_sharedslack(docs,model,label,a,lin,c,
			slack,alphaslack,totdoc,learn_parm,
			maxdiff,epsilon_crit_org,&misclassified,
			active2dnum,last_suboptimal_at,
			iteration,kernel_parm);

		if(verbosity>=2) {
			t6=get_runtime();
			timing_profile->time_select+=t1-t0;
			timing_profile->time_kernel+=t2-t1;
			timing_profile->time_opti+=t3-t2;
			timing_profile->time_update+=t4-t3;
			timing_profile->time_model+=t5-t4;
			timing_profile->time_check+=t6-t5;
		}

		/* checking whether optimizer got stuck */
		if((*maxdiff) < bestmaxdiff) {
			bestmaxdiff=(*maxdiff);
			bestmaxdiffiter=iteration;
		}
		if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) { 
			/* long time no progress? */
			terminate=1;
			retrain=0;
			if(verbosity>=1) 
				printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n");
		}

		noshrink=0; 

		if((!retrain) && (inactivenum>0) 
			&& ((!learn_parm->skip_final_opt_check) 
			|| (kernel_parm->kernel_type == LINEAR))) { 
				if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) 
					|| (verbosity>=2)) {
						if(verbosity==1) {
							printf("\n");
						}
						printf(" Checking optimality of inactive variables..."); 
						fflush(stdout);
				}
				t1=get_runtime();
				reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc,
					totwords,iteration,learn_parm,inconsistent,
					docs,kernel_parm,kernel_cache,model,aicache,
					weights,maxdiff);
				/* Update to new active variables. */
				activenum=compute_index(shrink_state->active,totdoc,active2dnum);
				inactivenum=totdoc-activenum;
				/* check optimality here, since the check inside
				reactivate_inactive_examples does not work for shared slacks */
				compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm,
					slack,alphaslack);
				retrain=check_optimality_sharedslack(docs,model,label,a,lin,c,
					slack,alphaslack,totdoc,learn_parm,
					maxdiff,epsilon_crit_org,&misclassified,
					active2dnum,last_suboptimal_at,
					iteration,kernel_parm);

				/* reset watchdog */
				bestmaxdiff=(*maxdiff);
				bestmaxdiffiter=iteration;
				/* termination criterion */
				noshrink=1;
				retrain=0;
				if((*maxdiff) > learn_parm->epsilon_crit) 
					retrain=1;
				timing_profile->time_shrink+=get_runtime()-t1;
				if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) 
					|| (verbosity>=2)) {
						printf("done.\n");  fflush(stdout);
						printf(" Number of inactive variables = %ld\n",inactivenum);
				}		  
		}

		if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff))) 
			learn_parm->epsilon_crit=(*maxdiff);
		if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) {
			learn_parm->epsilon_crit/=2.0;
			retrain=1;
			noshrink=1;
		}
		if(learn_parm->epsilon_crit<epsilon_crit_org) 
			learn_parm->epsilon_crit=epsilon_crit_org;

		if(verbosity>=2) {
			printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n",
				supvecnum,model->at_upper_bound,(*maxdiff)); 
			fflush(stdout);
		}
		if(verbosity>=3) {
			printf("\n");
		}

		if(((iteration % 10) == 0) && (!noshrink)) {
			activenum=shrink_problem(docs,learn_parm,shrink_state,
				kernel_parm,active2dnum,
				last_suboptimal_at,iteration,totdoc,
				maxl((long)(activenum/10),
				maxl((long)(totdoc/500),100)),
				a,inconsistent);
			inactivenum=totdoc-activenum;
			if((kernel_cache)
				&& (supvecnum>kernel_cache->max_elems)
				&& ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) {
					kernel_cache_shrink(kernel_cache,totdoc,
						minl((kernel_cache->activenum-activenum),
						(kernel_cache->activenum-supvecnum)),
						shrink_state->active); 
			}
		}

	} /* end of loop */


	free(alphaslack);
	free(slack);
	free(chosen);
	free(unlabeled);
	free(inconsistent);
	free(ignore);
	free(last_suboptimal_at);
	free(key);
	free(selcrit);
	free(selexam);
	free(a_old);
	free(aicache);
	free(working2dnum);
	free(active2dnum);
	free(qp.opt_ce);
	free(qp.opt_ce0);
	free(qp.opt_g);
	free(qp.opt_g0);
	free(qp.opt_xinit);
	free(qp.opt_low);
	free(qp.opt_up);
	if(weights) free(weights);

	learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */
	model->maxdiff=(*maxdiff);

	return(iteration);
}
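
/* Illustrative sketch (not original SVM-light code): the projection step
   performed after each call to optimize_svm above. Alphas that share one
   slack set s must jointly satisfy sum_{i in s} a[i] <= C; if the QP step
   overshoots, each working-set alpha is reduced until the bound holds
   again. All names below are hypothetical. */
static void project_alphas_to_joint_bound(DOC **docs, double *a,
										  double *alphaslack,
										  long *working2dnum, double svm_c)
{
	long i,jj;
	for(jj=0;(i=working2dnum[jj])>=0;jj++) {
		if(alphaslack[docs[i]->slackid] > svm_c) {
			if(a[i] < (alphaslack[docs[i]->slackid]-svm_c)) {
				alphaslack[docs[i]->slackid]-=a[i]; /* zeroing a[i] is enough */
				a[i]=0;
			}
			else {                            /* shrink a[i] just enough */
				a[i]-=(alphaslack[docs[i]->slackid]-svm_c);
				alphaslack[docs[i]->slackid]=svm_c;
			}
		}
	}
}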


double compute_objective_function(double *a, double *lin, double *c, 
								  double eps, long int *label, 
								  long int *active2dnum)
								  /* Return value of objective function. */
								  /* Works only relative to the active variables! */
{
	long i,ii;
	double criterion;
	/* calculate value of objective function */
	criterion=0;
	for(ii=0;active2dnum[ii]>=0;ii++) {
		i=active2dnum[ii];
		criterion=criterion+(eps-(double)label[i]*c[i])*a[i]+0.5*a[i]*label[i]*lin[i];
	} 
	return(criterion);
}
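
/* For reference (added comment, not original): with
   lin[i] = sum_j a[j]*y[j]*K(x_i,x_j), the value computed above is the
   minimization form of the dual objective, restricted to the active
   variables:

	  W(a) = sum_i (eps - y_i*c_i)*a_i
		   + 0.5*sum_{i,j} y_i*y_j*a_i*a_j*K(x_i,x_j)

   Assuming eps is set to -1 and c_i to 0 for classification (as SVM-light
   does when setting up the problem), this reduces to the familiar
   -sum_i a_i + 0.5*||w||^2 in feature space. */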

void clear_index(long int *index)
/* initializes and empties index */
{
	index[0]=-1;
} 

void add_to_index(long int *index, long int elem)
/* appends elem to the -1 terminated index */
{
	register long i;
	for(i=0;index[i] != -1;i++);
	index[i]=elem;
	index[i+1]=-1;
}

long compute_index(long int *binfeature, long int range, long int *index)
/* create an inverted index of binfeature */
{               
	register long i,ii;

	ii=0;
	for(i=0;i<range;i++) {
		if(binfeature[i]) {
			index[ii]=i;
			ii++;
		}
	}
	for(i=0;i<4;i++) {
		index[ii+i]=-1;
	}
	return(ii);
}
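
/* Illustrative example (hypothetical, not original code): the '2dnum'
   arrays used throughout this module are -1 terminated lists of document
   numbers; compute_index turns a 0/1 activity vector into such a list. */
static void example_index_convention()
{
	long binfeature[5]={1,0,1,1,0};
	long index[5+4];           /* needs up to range+4 slots (terminators) */
	long num=compute_index(binfeature,5,index);
	/* num==3 and index=={0,2,3,-1,...}; loops written as
	   for(ii=0;(i=index[ii])>=0;ii++) then visit exactly 0, 2, 3 */
	(void)num;
}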


void optimize_svm(DOC **docs, long int *label, long int *unlabeled, 
				  long int *exclude_from_eq_const, double eq_target,
				  long int *chosen, long int *active2dnum, MODEL *model, 
				  long int totdoc, long int *working2dnum, long int varnum, 
				  double *a, double *lin, double *c, LEARN_PARM *learn_parm, 
				  CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp, 
				  double *epsilon_crit_target)
				  /* Do optimization on the working set. */
{
	long i;
	double *a_v;

	compute_matrices_for_optimization(docs,label,unlabeled,
		exclude_from_eq_const,eq_target,chosen,
		active2dnum,working2dnum,model,a,lin,c,
		varnum,totdoc,learn_parm,aicache,
		kernel_parm,qp);

	if(verbosity>=3) {
		printf("Running optimizer..."); fflush(stdout);
	}
	/* call the qp-subsolver */
	a_v=optimize_qp(qp,epsilon_crit_target,
		learn_parm->svm_maxqpsize,
		&(model->b),   /* in case the optimizer gives us */
		/* the threshold for free. otherwise */
		/* b is calculated in calculate_model. */
		learn_parm);
	if(verbosity>=3) {         
		printf("done\n");
	}

	for(i=0;i<varnum;i++) {
		a[working2dnum[i]]=a_v[i];
		/*
		if(a_v[i]<=(0+learn_parm->epsilon_a)) {
		a[working2dnum[i]]=0;
		}
		else if(a_v[i]>=(learn_parm->svm_cost[working2dnum[i]]-learn_parm->epsilon_a)) {
		a[working2dnum[i]]=learn_parm->svm_cost[working2dnum[i]];
		}
		*/
	}
}

void compute_matrices_for_optimization(DOC **docs, long int *label, 
									   long int *unlabeled, long *exclude_from_eq_const, double eq_target,
									   long int *chosen, long int *active2dnum, 
									   long int *key, MODEL *model, double *a, double *lin, double *c, 
									   long int varnum, long int totdoc, LEARN_PARM *learn_parm, 
									   CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp)
{
	register long ki,kj,i,j;
	register double kernel_temp;

	if(verbosity>=3) {
		fprintf(stdout,"Computing qp-matrices (type %ld kernel [degree %ld, rbf_gamma %f, coef_lin %f, coef_const %f])...",kernel_parm->kernel_type,kernel_parm->poly_degree,kernel_parm->rbf_gamma,kernel_parm->coef_lin,kernel_parm->coef_const); 
		fflush(stdout);
	}

	qp->opt_n=varnum;
	qp->opt_ce0[0]=-eq_target; /* compute the constant for equality constraint */
	for(j=1;j<model->sv_num;j++) { /* start at 1 */
		if((!chosen[(model->supvec[j])->docnum])
			&& (!exclude_from_eq_const[(model->supvec[j])->docnum])) {
				qp->opt_ce0[0]+=model->alpha[j];
		}
	} 
	if(learn_parm->biased_hyperplane) 
		qp->opt_m=1;
	else 
		qp->opt_m=0;  /* eq-constraint will be ignored */

	/* init linear part of objective function */
	for(i=0;i<varnum;i++) {
		qp->opt_g0[i]=lin[key[i]];
	}

	for(i=0;i<varnum;i++) {
		ki=key[i];

		/* Compute the matrix for equality constraints */
		qp->opt_ce[i]=label[ki];
		qp->opt_low[i]=0;
		qp->opt_up[i]=learn_parm->svm_cost[ki];

		kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[ki]); 
		/* compute linear part of objective function */
		qp->opt_g0[i]-=(kernel_temp*a[ki]*(double)label[ki]); 
		/* compute quadratic part of objective function */
		qp->opt_g[varnum*i+i]=kernel_temp;
		for(j=i+1;j<varnum;j++) {
			kj=key[j];
			kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[kj]);
			/* compute linear part of objective function */
			qp->opt_g0[i]-=(kernel_temp*a[kj]*(double)label[kj]);
			qp->opt_g0[j]-=(kernel_temp*a[ki]*(double)label[ki]); 
			/* compute quadratic part of objective function */
			qp->opt_g[varnum*i+j]=(double)label[ki]*(double)label[kj]*kernel_temp;
			qp->opt_g[varnum*j+i]=(double)label[ki]*(double)label[kj]*kernel_temp;
		}

		if(verbosity>=3) {
			if(i % 20 == 0) {
				fprintf(stdout,"%ld..",i); fflush(stdout);
			}
		}
	}

	for(i=0;i<varnum;i++) {
		/* assure starting at feasible point */
		qp->opt_xinit[i]=a[key[i]];
		/* set linear part of objective function */
		qp->opt_g0[i]=(learn_parm->eps-(double)label[key[i]]*c[key[i]])+qp->opt_g0[i]*(double)label[key[i]];    
	}

	if(verbosity>=3) {
		fprintf(stdout,"done\n");
	}
}
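
/* For reference (added comment, not original): the structure filled above
   is handed to optimize_qp as the subproblem

	  minimize    0.5 * x'*opt_g*x + opt_g0'*x
	  subject to  opt_ce'*x + opt_ce0 == 0     (only if opt_m == 1)
				  opt_low <= x <= opt_up

   over the opt_n working-set alphas x, where opt_g holds the label-signed
   kernel matrix y_i*y_j*K(x_i,x_j) and opt_g0 folds in the influence of
   all variables outside the working set via lin[]. */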

long calculate_svm_model(DOC **docs, long int *label, long int *unlabeled, 
						 double *lin, double *a, double *a_old, double *c, 
						 LEARN_PARM *learn_parm, long int *working2dnum, 
						 long int *active2dnum, MODEL *model)
						 /* Compute decision function based on current values */
						 /* of alpha. */
{
	long i,ii,pos,b_calculated=0,first_low,first_high;
	double ex_c,b_temp,b_low,b_high;

	if(verbosity>=3) {
		printf("Calculating model..."); fflush(stdout);
	}

	if(!learn_parm->biased_hyperplane) {
		model->b=0;
		b_calculated=1;
	}

	for(ii=0;(i=working2dnum[ii])>=0;ii++) {
		if((a_old[i]>0) && (a[i]==0)) { /* remove from model */
			pos=model->index[i]; 
			model->index[i]=-1;
			(model->sv_num)--;
			model->supvec[pos]=model->supvec[model->sv_num];
			model->alpha[pos]=model->alpha[model->sv_num];
			model->index[(model->supvec[pos])->docnum]=pos;
		}
		else if((a_old[i]==0) && (a[i]>0)) { /* add to model */
			model->supvec[model->sv_num]=docs[i];
			model->alpha[model->sv_num]=a[i]*(double)label[i];
			model->index[i]=model->sv_num;
			(model->sv_num)++;
		}
		else if(a_old[i]==a[i]) { /* nothing to do */
		}
		else {  /* just update alpha */
			model->alpha[model->index[i]]=a[i]*(double)label[i];
		}

		ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
		if(!learn_parm->sharedslack) {
			if((a_old[i]>=ex_c) && (a[i]<ex_c)) { 
				(model->at_upper_bound)--;
			}
			else if((a_old[i]<ex_c) && (a[i]>=ex_c)) { 
				(model->at_upper_bound)++;
			}
		}

		if((!b_calculated) 
			&& (a[i]>learn_parm->epsilon_a) && (a[i]<ex_c)) {   /* calculate b */
				model->b=((double)label[i]*learn_parm->eps-c[i]+lin[i]); 
				/* model->b=(-(double)label[i]+lin[i]); */
				b_calculated=1;
		}
	}      

	/* If no alpha in the working set is strictly between the bounds, b was
	not calculated in the usual way above. The following handles this
	special case. */
	if(learn_parm->biased_hyperplane 
		&& (!b_calculated)
		&& (model->sv_num-1 == model->at_upper_bound)) { 
			first_low=1;
			first_high=1;
			b_low=0;
			b_high=0;
			for(ii=0;(i=active2dnum[ii])>=0;ii++) {
				ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
				if(a[i]<ex_c) { 
					if(label[i]>0)  {
						b_temp=-(learn_parm->eps-c[i]+lin[i]);
						if((b_temp>b_low) || (first_low)) {
							b_low=b_temp;
							first_low=0;
						}
					}
					else {
						b_temp=-(-learn_parm->eps-c[i]+lin[i]);
						if((b_temp<b_high) || (first_high)) {
							b_high=b_temp;
							first_high=0;
						}
					}
				}
				else {
					if(label[i]<0)  {
						b_temp=-(-learn_parm->eps-c[i]+lin[i]);
						if((b_temp>b_low) || (first_low)) {
							b_low=b_temp;
							first_low=0;
						}
					}
					else {
						b_temp=-(learn_parm->eps-c[i]+lin[i]);
						if((b_temp<b_high) || (first_high)) {
							b_high=b_temp;
							first_high=0;
						}
					}
				}
			}
			if(first_high) {
				model->b=-b_low;
			}
			else if(first_low) {
				model->b=-b_high;
			}
			else {
				model->b=-(b_high+b_low)/2.0;  /* select b as the middle of range */
				/* printf("\nb_low=%f, b_high=%f,b=%f\n",b_low,b_high,model->b); */
			}
	}

	if(verbosity>=3) {
		printf("done\n"); fflush(stdout);
	}

	return(model->sv_num-1); /* have to subtract one, since element 0 is empty*/
}
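
/* For reference (added comment, not original): for a free support vector
   (epsilon_a < a_i < C_i) the KT conditions hold with equality,
   y_i*(lin[i]-b) == -(eps - y_i*c[i]), which rearranges to the assignment
   model->b = y_i*eps - c[i] + lin[i] used above. */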

long check_optimality(MODEL *model, long int *label, long int *unlabeled, 
					  double *a, double *lin, double *c, long int totdoc, 
					  LEARN_PARM *learn_parm, double *maxdiff, 
					  double epsilon_crit_org, long int *misclassified, 
					  long int *inconsistent, long int *active2dnum,
					  long int *last_suboptimal_at, 
					  long int iteration, KERNEL_PARM *kernel_parm)
					  /* Check KT-conditions */
{
	long i,ii,retrain;
	double dist,ex_c,target;

	if(kernel_parm->kernel_type == LINEAR) {  /* be optimistic */
		learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org;  
	}
	else {  /* be conservative */
		learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; 
	}
	retrain=0;
	(*maxdiff)=0;
	(*misclassified)=0;
	for(ii=0;(i=active2dnum[ii])>=0;ii++) {
		if((!inconsistent[i]) && label[i]) {
			dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from
													hyperplane*/
			target=-(learn_parm->eps-(double)label[i]*c[i]);
			ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
			if(dist <= 0) {       
				(*misclassified)++;  /* count unreliable once vars are deactivated */
			}
			if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
				if((dist-target)>(*maxdiff))  /* largest violation */
					(*maxdiff)=dist-target;
			}
			else if((a[i]<ex_c) && (dist < target)) {
				if((target-dist)>(*maxdiff))  /* largest violation */
					(*maxdiff)=target-dist;
			}
			/* Count how long a variable was at lower/upper bound (and optimal).*/
			/* Variables that were at the bound and optimal for a long */
			/* time are unlikely to become support vectors. In case our */
			/* cache is filled up, those variables are excluded to save */
			/* kernel evaluations. (See chapter 'Shrinking').*/ 
			if((a[i]>(learn_parm->epsilon_a)) 
				&& (a[i]<ex_c)) { 
					last_suboptimal_at[i]=iteration;         /* not at bound */
			}
			else if((a[i]<=(learn_parm->epsilon_a)) 
				&& (dist < (target+learn_parm->epsilon_shrink))) {
					last_suboptimal_at[i]=iteration;         /* not likely optimal */
			}
			else if((a[i]>=ex_c)
				&& (dist > (target-learn_parm->epsilon_shrink)))  { 
					last_suboptimal_at[i]=iteration;         /* not likely optimal */
			}
		}   
	}
	/* termination criterion */
	if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) {  
		retrain=1;
	}
	return(retrain);
}
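
/* For reference (added comment, not original): with
   dist_i = y_i*(lin[i]-b) and target_i = -(eps - y_i*c[i]), the conditions
   checked above are the KT conditions of the dual:
	  a_i == 0       =>  dist_i >= target_i
	  0 < a_i < C_i  =>  dist_i == target_i
	  a_i == C_i     =>  dist_i <= target_i
   each up to epsilon_crit; (*maxdiff) returns the largest violation found
   among the active variables. */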

long check_optimality_sharedslack(DOC **docs, MODEL *model, long int *label,
								  double *a, double *lin, double *c, double *slack,
								  double *alphaslack,
								  long int totdoc, 
								  LEARN_PARM *learn_parm, double *maxdiff, 
								  double epsilon_crit_org, long int *misclassified, 
								  long int *active2dnum,
								  long int *last_suboptimal_at, 
								  long int iteration, KERNEL_PARM *kernel_parm)
								  /* Check KT-conditions */
{
	long i,ii,retrain;
	double dist,dist_noslack,ex_c=0,target;

	if(kernel_parm->kernel_type == LINEAR) {  /* be optimistic */
		learn_parm->epsilon_shrink=-learn_parm->epsilon_crit/2.0;
	}
	else {  /* be conservative */
		learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; 
	}

	retrain=0;
	(*maxdiff)=0;
	(*misclassified)=0;
	for(ii=0;(i=active2dnum[ii])>=0;ii++) {
		/* 'distance' from hyperplane*/
		dist_noslack=(lin[i]-model->b)*(double)label[i];
		dist=dist_noslack+slack[docs[i]->slackid];
		target=-(learn_parm->eps-(double)label[i]*c[i]);
		ex_c=learn_parm->svm_c-learn_parm->epsilon_a;
		if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
			if((dist-target)>(*maxdiff)) {  /* largest violation */
				(*maxdiff)=dist-target;
				if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]);
				if(verbosity>=5) printf(" (single %f)\n",(*maxdiff));
			}
		}
		if((alphaslack[docs[i]->slackid]<ex_c) && (slack[docs[i]->slackid]>0)) {
			if((slack[docs[i]->slackid])>(*maxdiff)) { /* largest violation */
				(*maxdiff)=slack[docs[i]->slackid];
				if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]);
				if(verbosity>=5) printf(" (joint %f)\n",(*maxdiff));
			}
		}
		/* Count how long a variable was at lower/upper bound (and optimal).*/
		/* Variables that were at the bound and optimal for a long */
		/* time are unlikely to become support vectors. In case our */
		/* cache is filled up, those variables are excluded to save */
		/* kernel evaluations. (See chapter 'Shrinking').*/
		if((a[i]<=learn_parm->epsilon_a) && (dist < (target+learn_parm->epsilon_shrink))) {
			last_suboptimal_at[i]=iteration;  /* not likely optimal */
		}
		else if((alphaslack[docs[i]->slackid]<ex_c) && (a[i]>learn_parm->epsilon_a) && (fabs(dist_noslack - target) > -learn_parm->epsilon_shrink)) { 
			last_suboptimal_at[i]=iteration;  /* not at lower bound */
		}
		else if((alphaslack[docs[i]->slackid]>=ex_c) && (a[i]>learn_parm->epsilon_a) && (fabs(target-dist) > -learn_parm->epsilon_shrink)) {
			last_suboptimal_at[i]=iteration;  /* not likely optimal */
		}
	}
	/* termination criterion */
	if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) {  
		retrain=1;
	}
	return(retrain);
}

void compute_shared_slacks(DOC **docs, long int *label, 
						   double *a, double *lin, 
						   double *c, long int *active2dnum,
						   LEARN_PARM *learn_parm, 
						   double *slack, double *alphaslack)
						   /* compute the value of shared slacks and the joint alphas */
{
	long jj,i;
	double dist,target;

	for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* clear slack variables */
		slack[docs[i]->slackid]=0.0;
		/*    alphaslack[docs[i]->slackid]=0.0; */
	}
	for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* recompute slack variables */
		dist=(lin[i])*(double)label[i];
		target=-(learn_parm->eps-(double)label[i]*c[i]);
		if((target-dist) > slack[docs[i]->slackid])
			slack[docs[i]->slackid]=target-dist;
		/*    alphaslack[docs[i]->slackid]+=a[i]; */
	}
}
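
/* For reference (added comment, not original): with dist_i = y_i*lin[i]
   (note: no bias term here) and target_i = -(eps - y_i*c[i]), the code
   above computes
	  slack[s] = max(0, max_{i: slackid(i)==s} (target_i - dist_i)),
   i.e. the single slack variable shared by all examples of set s equals
   the largest margin violation within the set. */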


long identify_inconsistent(double *a, long int *label, 
						   long int *unlabeled, long int totdoc, 
						   LEARN_PARM *learn_parm, 
						   long int *inconsistentnum, long int *inconsistent)
{
	long i,retrain;

	/* Throw out examples with multipliers at upper bound. This */
	/* corresponds to the -i 1 option. */
	/* ATTENTION: this is just a heuristic for finding a */
	/*            close-to-minimum number of examples to exclude */
	/*            to make the problem separable with the desired margin */
	retrain=0;
	for(i=0;i<totdoc;i++) {
		if((!inconsistent[i]) && (!unlabeled[i]) 
			&& (a[i]>=(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) { 
				(*inconsistentnum)++;
				inconsistent[i]=1;  /* never choose again */
				retrain=2;          /* start over */
				if(verbosity>=3) {
					printf("inconsistent(%ld)..",i); fflush(stdout);
				}
		}
	}
	return(retrain);
}

long identify_misclassified(double *lin, long int *label, 
							long int *unlabeled, long int totdoc, 
							MODEL *model, long int *inconsistentnum, 
							long int *inconsistent)
{
	long i,retrain;
	double dist;

	/* Throw out misclassified examples. This */
	/* corresponds to the -i 2 option. */
	/* ATTENTION: this is just a heuristic for finding a */
	/*            close-to-minimum number of examples to exclude */
	/*            to make the problem separable with the desired margin */
	retrain=0;
	for(i=0;i<totdoc;i++) {
		dist=(lin[i]-model->b)*(double)label[i]; /* 'distance' from hyperplane*/  
		if((!inconsistent[i]) && (!unlabeled[i]) && (dist <= 0)) { 
			(*inconsistentnum)++;
			inconsistent[i]=1;  /* never choose again */
			retrain=2;          /* start over */
			if(verbosity>=3) {
				printf("inconsistent(%ld)..",i); fflush(stdout);
			}
		}
	}
	return(retrain);
}

long identify_one_misclassified(double *lin, long int *label, 
								long int *unlabeled, 
								long int totdoc, MODEL *model, 
								long int *inconsistentnum, 
								long int *inconsistent)
{
	long i,retrain,maxex=-1;
	double dist,maxdist=0;

	/* Throw out the 'most misclassified' example. This */
	/* corresponds to the -i 3 option. */
	/* ATTENTION: this is just a heuristic for finding a */
	/*            close-to-minimum number of examples to exclude */
	/*            to make the problem separable with the desired margin */
	retrain=0;
	for(i=0;i<totdoc;i++) {
		if((!inconsistent[i]) && (!unlabeled[i])) {
			dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from hyperplane*/  
			if(dist<maxdist) {
				maxdist=dist;
				maxex=i;
			}
		}
	}
	if(maxex>=0) {
		(*inconsistentnum)++;
		inconsistent[maxex]=1;  /* never choose again */
		retrain=2;          /* start over */
		if(verbosity>=3) {
			printf("inconsistent(%ld)..",i); fflush(stdout);
		}
	}
	return(retrain);
}

void update_linear_component(DOC **docs, long int *label, 
							 long int *active2dnum, double *a, 
							 double *a_old, long int *working2dnum, 
							 long int totdoc, long int totwords, 
							 KERNEL_PARM *kernel_parm, 
							 KERNEL_CACHE *kernel_cache, 
							 double *lin, CFLOAT *aicache, double *weights)
							 /* keep track of the linear component */
							 /* lin of the gradient etc. by updating */
							 /* based on the change of the variables */
							 /* in the current working set */
							 /* WARNING: Assumes that array of weights is initialized to all zero 
							 values for linear kernel! */
{
	register long i,ii,j,jj;
	register double tec;
	SVECTOR *f;

	if(kernel_parm->kernel_type==0) { /* special linear case */
		/* clear_vector_n(weights,totwords); */
		for(ii=0;(i=working2dnum[ii])>=0;ii++) {
			if(a[i] != a_old[i]) {
				for(f=docs[i]->fvec;f;f=f->next)  
					add_vector_ns(weights,f,
					f->factor*((a[i]-a_old[i])*(double)label[i]));
			}
		}
		for(jj=0;(j=active2dnum[jj])>=0;jj++) {
			for(f=docs[j]->fvec;f;f=f->next)  
				lin[j]+=f->factor*sprod_ns(weights,f);
		}
		for(ii=0;(i=working2dnum[ii])>=0;ii++) {
			if(a[i] != a_old[i]) {
				for(f=docs[i]->fvec;f;f=f->next)  
					mult_vector_ns(weights,f,0.0);  /* Set weights back to zero. */
			}                                   /* This is faster than init */
		}                                     /* weights to zero in each iter. */
	}
	else {                            /* general case */
		for(jj=0;(i=working2dnum[jj])>=0;jj++) {
			if(a[i] != a_old[i]) {
				get_kernel_row(kernel_cache,docs,i,totdoc,active2dnum,aicache,
					kernel_parm);
				for(ii=0;(j=active2dnum[ii])>=0;ii++) {
					tec=aicache[j];
					lin[j]+=(((a[i]*tec)-(a_old[i]*tec))*(double)label[i]);
				}
			}
		}
	}
}
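
/* Illustrative sketch (not original SVM-light code): the general case
   above exploits lin[j] = sum_i a[i]*y[i]*K(i,j), so after a QP step only
   the terms of variables whose alpha changed need updating:
	  lin[j] += (a[i]-a_old[i]) * y[i] * K(i,j)
   Given one precomputed kernel row (as returned by get_kernel_row), the
   core update is the hypothetical helper below. */
static void example_lin_update(double *lin, double a_new, double a_old,
							   long y, CFLOAT *row, long *active2dnum)
{
	long j,jj;
	for(jj=0;(j=active2dnum[jj])>=0;jj++)
		lin[j]+=(a_new-a_old)*(double)y*row[j];
}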


long incorporate_unlabeled_examples(MODEL *model, long int *label, 
									long int *inconsistent, 
									long int *unlabeled, 
									double *a, double *lin, 
									long int totdoc, double *selcrit, 
									long int *select, long int *key, 
									long int transductcycle, 
									KERNEL_PARM *kernel_parm, 
									LEARN_PARM *learn_parm)
{
	long i,j,k,j1,j2,j3,j4,unsupaddnum1=0,unsupaddnum2=0;
	long pos,neg,upos,uneg,orgpos,orgneg,nolabel,newpos,newneg,allunlab;
	double dist,model_length,posratio,negratio;
	long check_every=2;
	double loss;
	static double switchsens=0.0,switchsensorg=0.0;
	double umin,umax,sumalpha;
	long imin=0,imax=0;
	static long switchnum=0;

	switchsens/=1.2;

	/* assumes that lin[] is up to date -> no inactive vars */

	orgpos=0;
	orgneg=0;
	newpos=0;
	newneg=0;
	nolabel=0;
	allunlab=0;
	for(i=0;i<totdoc;i++) {
		if(!unlabeled[i]) {
			if(label[i] > 0) {
				orgpos++;
			}
			else {
				orgneg++;
			}
		}
		else {
			allunlab++;
			if(unlabeled[i]) {
				if(label[i] > 0) {
					newpos++;
				}
				else if(label[i] < 0) {
					newneg++;
				}
			}
		}
		if(label[i]==0) {
			nolabel++;
		}
	}

	if(learn_parm->transduction_posratio >= 0) {
		posratio=learn_parm->transduction_posratio;
	}
	else {
		posratio=(double)orgpos/(double)(orgpos+orgneg); /* use ratio of pos/neg */
	}                                                  /* in training data */
	negratio=1.0-posratio;

	learn_parm->svm_costratio=1.0;                     /* global */
	if(posratio>0) {
		learn_parm->svm_costratio_unlab=negratio/posratio;
	}
	else {
		learn_parm->svm_costratio_unlab=1.0;
	}

	pos=0;
	neg=0;
	upos=0;
	uneg=0;
	for(i=0;i<totdoc;i++) {
		dist=(lin[i]-model->b);  /* 'distance' from hyperplane*/
		if(dist>0) {
			pos++;
		}
		else {
			neg++;
		}
		if(unlabeled[i]) {
			if(dist>0) {
				upos++;
			}
			else {
				uneg++;
			}
		}
		if((!unlabeled[i]) && (a[i]>(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) {
			/*      printf("Ubounded %ld (class %ld, unlabeled %ld)\n",i,label[i],unlabeled[i]); */
		}
	}
	if(verbosity>=2) {
		printf("POS=%ld, ORGPOS=%ld, ORGNEG=%ld\n",pos,orgpos,orgneg);
		printf("POS=%ld, NEWPOS=%ld, NEWNEG=%ld\n",pos,newpos,newneg);
		printf("pos ratio = %f (%f).\n",(double)(upos)/(double)(allunlab),posratio);
		fflush(stdout);
	}

	if(transductcycle == 0) {
		j1=0; 
		j2=0;
		j4=0;
		for(i=0;i<totdoc;i++) {
			dist=(lin[i]-model->b);  /* 'distance' from hyperplane*/
			if((label[i]==0) && (unlabeled[i])) {
				selcrit[j4]=dist;
				key[j4]=i;
				j4++;
			}
		}
		unsupaddnum1=0;	
		unsupaddnum2=0;	
		select_top_n(selcrit,j4,select,(long)(allunlab*posratio+0.5));
		for(k=0;(k<(long)(allunlab*posratio+0.5));k++) {
			i=key[select[k]];
			label[i]=1;
			unsupaddnum1++;	
			j1++;
		}
		for(i=0;i<totdoc;i++) {
			if((label[i]==0) && (unlabeled[i])) {
				label[i]=-1;
				j2++;
				unsupaddnum2++;
			}
		}
		for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
			if(unlabeled[i]) {
				if(label[i] == 1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
				}
				else if(label[i] == -1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_unlabbound;
				}
			}
		}
		if(verbosity>=1) {
			/* printf("costratio %f, costratio_unlab %f, unlabbound %f\n",
			learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
			learn_parm->svm_unlabbound); */
			printf("Classifying unlabeled data as %ld POS / %ld NEG.\n",
				unsupaddnum1,unsupaddnum2); 
			fflush(stdout);
		}
		if(verbosity >= 1) 
			printf("Retraining.");
		if(verbosity >= 2) printf("\n");
		return((long)3);
	}
	if((transductcycle % check_every) == 0) {
		if(verbosity >= 1) 
			printf("Retraining.");
		if(verbosity >= 2) printf("\n");
		j1=0;
		j2=0;
		unsupaddnum1=0;
		unsupaddnum2=0;
		for(i=0;i<totdoc;i++) {
			if((unlabeled[i] == 2)) {
				unlabeled[i]=1;
				label[i]=1;
				j1++;
				unsupaddnum1++;
			}
			else if((unlabeled[i] == 3)) {
				unlabeled[i]=1;
				label[i]=-1;
				j2++;
				unsupaddnum2++;
			}
		}
		for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
			if(unlabeled[i]) {
				if(label[i] == 1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
				}
				else if(label[i] == -1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_unlabbound;
				}
			}
		}

		if(verbosity>=2) {
			/* printf("costratio %f, costratio_unlab %f, unlabbound %f\n",
			learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
			learn_parm->svm_unlabbound); */
			printf("%ld positive -> Added %ld POS / %ld NEG unlabeled examples.\n",
				upos,unsupaddnum1,unsupaddnum2); 
			fflush(stdout);
		}

		if(learn_parm->svm_unlabbound == 1) {
			learn_parm->epsilon_crit=0.001; /* do the last run right */
		}
		else {
			learn_parm->epsilon_crit=0.01; /* otherwise, no need to be so picky */
		}

		return((long)3);
	}
	else if(((transductcycle % check_every) < check_every)) { 
		model_length=0;
		sumalpha=0;
		loss=0;
		for(i=0;i<totdoc;i++) {
			model_length+=a[i]*label[i]*lin[i];
			sumalpha+=a[i];
			dist=(lin[i]-model->b);  /* 'distance' from hyperplane*/
			if((label[i]*dist)<(1.0-learn_parm->epsilon_crit)) {
				loss+=(1.0-(label[i]*dist))*learn_parm->svm_cost[i]; 
			}
		}
		model_length=sqrt(model_length); 
		if(verbosity>=2) {
			printf("Model-length = %f (%f), loss = %f, objective = %f\n",
				model_length,sumalpha,loss,loss+0.5*model_length*model_length);
			fflush(stdout);
		}
		j1=0;
		j2=0;
		j3=0;
		j4=0;
		unsupaddnum1=0;	
		unsupaddnum2=0;	
		umin=99999;
		umax=-99999;
		j4=1;
		while(j4) {
			umin=99999;
			umax=-99999;
			for(i=0;(i<totdoc);i++) { 
				dist=(lin[i]-model->b);  
				if((label[i]>0) && (unlabeled[i]) && (!inconsistent[i]) 
					&& (dist<umin)) {
						umin=dist;
						imin=i;
				}
				if((label[i]<0) && (unlabeled[i])  && (!inconsistent[i]) 
					&& (dist>umax)) {
						umax=dist;
						imax=i;
				}
			}
			if((umin < (umax+switchsens-1E-4))) {
				j1++;
				j2++;
				unsupaddnum1++;	
				unlabeled[imin]=3;
				inconsistent[imin]=1;
				unsupaddnum2++;	
				unlabeled[imax]=2;
				inconsistent[imax]=1;
			}
			else
				j4=0;
			j4=0;
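			/* Added note: the unconditional j4=0 above makes this while loop
			   a single pass, so at most one min/max pair of unlabeled
			   examples has its labels switched per call. */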
		}
		for(j=0;(j<totdoc);j++) {
			if(unlabeled[j] && (!inconsistent[j])) {
				if(label[j]>0) {
					unlabeled[j]=2;
				}
				else if(label[j]<0) {
					unlabeled[j]=3;
				}
				/* inconsistent[j]=1; */
				j3++;
			}
		}
		switchnum+=unsupaddnum1+unsupaddnum2;

		/* stop and print out current margin
		printf("switchnum %ld %ld\n",switchnum,kernel_parm->poly_degree);
		if(switchnum == 2*kernel_parm->poly_degree) {
		learn_parm->svm_unlabbound=1;
		}
		*/

		if((!unsupaddnum1) && (!unsupaddnum2)) {
			if((learn_parm->svm_unlabbound>=1) && ((newpos+newneg) == allunlab)) {
				for(j=0;(j<totdoc);j++) {
					inconsistent[j]=0;
					if(unlabeled[j]) unlabeled[j]=1;
				}
				write_prediction(learn_parm->predfile,model,lin,a,unlabeled,label,
					totdoc,learn_parm);  
				if(verbosity>=1)
					printf("Number of switches: %ld\n",switchnum);
				return((long)0);
			}
			switchsens=switchsensorg;
			learn_parm->svm_unlabbound*=1.5;
			if(learn_parm->svm_unlabbound>1) {
				learn_parm->svm_unlabbound=1;
			}
			model->at_upper_bound=0; /* since upper bound increased */
			if(verbosity>=1) 
				printf("Increasing influence of unlabeled examples to %f%% .",
				learn_parm->svm_unlabbound*100.0);
		}
		else if(verbosity>=1) {
			printf("%ld positive -> Switching labels of %ld POS / %ld NEG unlabeled examples.",
				upos,unsupaddnum1,unsupaddnum2); 
			fflush(stdout);
		}

		if(verbosity >= 2) printf("\n");

		learn_parm->epsilon_crit=0.5; /* don't need to be so picky */

		for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
			if(unlabeled[i]) {
				if(label[i] == 1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
				}
				else if(label[i] == -1) {
					learn_parm->svm_cost[i]=learn_parm->svm_c*
						learn_parm->svm_unlabbound;
				}
			}
		}

		return((long)2);
	}

	return((long)0); 
}
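
/* For reference (added comment, not original): the caller treats the
   return value of incorporate_unlabeled_examples as the new 'retrain'
   flag -- any non-zero value triggers another training round with the
   updated labels and cost bounds, while 0 ends transduction (after the
   predictions have been written above). */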

/*************************** Working set selection ***************************/

long select_next_qp_subproblem_grad(long int *label, 
									long int *unlabeled, 
									double *a, double *lin, 
									double *c, long int totdoc, 
									long int qp_size, 
									LEARN_PARM *learn_parm, 
									long int *inconsistent, 
									long int *active2dnum, 
									long int *working2dnum, 
									double *selcrit, 
									long int *select, 
									KERNEL_CACHE *kernel_cache, 
									long int cache_only,
									long int *key, long int *chosen)
									/* Use the feasible direction approach to select the next
									qp-subproblem (see chapter 'Selecting a good working set'). If
									'cache_only' is true, then the variables are selected only among
									those for which the kernel evaluations are cached. */
{
	long choosenum,i,j,k,activedoc,inum,valid;
	double s;

	for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
	choosenum=0;
	activedoc=0;
	for(i=0;(j=active2dnum[i])>=0;i++) {
		s=-label[j];
		if(kernel_cache && cache_only) 
			valid=(kernel_cache->index[j]>=0);
		else
			valid=1;
		if(valid
			&& (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
			&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) 
			&& (s>0)))
			&& (!chosen[j]) 
			&& (label[j])
			&& (!inconsistent[j]))
		{
			selcrit[activedoc]=(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]);
			/*      selcrit[activedoc]=(double)label[j]*(-1.0+(double)label[j]*lin[j]); */
			key[activedoc]=j;
			activedoc++;
		}
	}
	select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
	for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
		/* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */
		i=key[select[k]];
		chosen[i]=1;
		working2dnum[inum+choosenum]=i;
		choosenum+=1;
		if(kernel_cache)
			kernel_cache_touch(kernel_cache,i); /* make sure it does not get
												kicked out of cache */
		/* } */
	}

	activedoc=0;
	for(i=0;(j=active2dnum[i])>=0;i++) {
		s=label[j];
		if(kernel_cache && cache_only) 
			valid=(kernel_cache->index[j]>=0);
		else
			valid=1;
		if(valid
			&& (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
			&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) 
			&& (s>0))) 
			&& (!chosen[j]) 
			&& (label[j])
			&& (!inconsistent[j])) 
		{
			selcrit[activedoc]=-(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]);
			/*  selcrit[activedoc]=-(double)(label[j]*(-1.0+(double)label[j]*lin[j])); */
			key[activedoc]=j;
			activedoc++;
		}
	}
	select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
	for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) {
		/* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */
		i=key[select[k]];
		chosen[i]=1;
		working2dnum[inum+choosenum]=i;
		choosenum+=1;
		if(kernel_cache)
			kernel_cache_touch(kernel_cache,i); /* make sure it does not get
												kicked out of cache */
		/* } */
	} 
	working2dnum[inum+choosenum]=-1; /* complete index */
	return(choosenum);
}
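
/* For reference (added comment, not original): the selection criterion
   above is the gradient of the dual objective along the feasible
   directions d_i = +/-y_i,
	  g_i = y_i*(eps - y_i*c[i] + y_i*lin[i]);
   the first pass ranks variables whose alpha can still move in one label
   direction (s=-y_i), the second those that can move in the other
   (s=+y_i), and up to qp_size/2 are taken from each -- a feasible-
   direction strategy in the spirit of Zoutendijk's method (see the
   'Selecting a good working set' chapter referenced above). */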

long select_next_qp_subproblem_rand(long int *label, 
									long int *unlabeled, 
									double *a, double *lin, 
									double *c, long int totdoc, 
									long int qp_size, 
									LEARN_PARM *learn_parm, 
									long int *inconsistent, 
									long int *active2dnum, 
									long int *working2dnum, 
									double *selcrit, 
									long int *select, 
									KERNEL_CACHE *kernel_cache, 
									long int *key, 
									long int *chosen, 
									long int iteration)
									/* Use the feasible direction approach to select the next
									qp-subproblem (see section 'Selecting a good working set'). Chooses
									a feasible direction at (pseudo) random to help jump over
									numerical problems. */
{
	long choosenum,i,j,k,activedoc,inum;
	double s;

	for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
	choosenum=0;
	activedoc=0;
	for(i=0;(j=active2dnum[i])>=0;i++) {
		s=-label[j];
		if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
			&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) 
			&& (s>0)))
			&& (!inconsistent[j]) 
			&& (label[j])
			&& (!chosen[j])) {
				selcrit[activedoc]=(j+iteration) % totdoc;
				key[activedoc]=j;
				activedoc++;
		}
	}
	select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
	for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
		i=key[select[k]];
		chosen[i]=1;
		working2dnum[inum+choosenum]=i;
		choosenum+=1;
		kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
		/* out of cache */
	}

	activedoc=0;
	for(i=0;(j=active2dnum[i])>=0;i++) {
		s=label[j];
		if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
			&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) 
			&& (s>0))) 
			&& (!inconsistent[j]) 
			&& (label[j])
			&& (!chosen[j])) {
				selcrit[activedoc]=(j+iteration) % totdoc;
				key[activedoc]=j;
				activedoc++;
		}
	}
	select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
	for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) {
		i=key[select[k]];
		chosen[i]=1;
		working2dnum[inum+choosenum]=i;
		choosenum+=1;
		kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
		/* out of cache */
	} 
	working2dnum[inum+choosenum]=-1; /* complete index */
	return(choosenum);
}

long select_next_qp_slackset(DOC **docs, long int *label, 
							 double *a, double *lin, 
							 double *slack, double *alphaslack, 
							 double *c,
							 LEARN_PARM *learn_parm, 
							 long int *active2dnum, double *maxviol)
							 /* returns the slackset with the largest internal violation */
{
	long i,ii,maxdiffid;
	double dist,target,maxdiff,ex_c;

	maxdiff=0;
	maxdiffid=0;
	for(ii=0;(i=active2dnum[ii])>=0;ii++) {
		ex_c=learn_parm->svm_c-learn_parm->epsilon_a;
		if(alphaslack[docs[i]->slackid] >= ex_c) {
			dist=(lin[i])*(double)label[i]+slack[docs[i]->slackid]; /* distance */
			target=-(learn_parm->eps-(double)label[i]*c[i]); /* rhs of constraint */
			if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
				if((dist-target)>maxdiff) { /* largest violation */
					maxdiff=dist-target;
					maxdiffid=docs[i]->slackid;
				}
			}
		}
	}
	(*maxviol)=maxdiff;
	return(maxdiffid);
}


void select_top_n(double *selcrit, long int range, long int *select, 
				  long int n)
{
	register long i,j;

	for(i=0;(i<n) && (i<range);i++) { /* Initialize with the first n elements */
		for(j=i;j>=0;j--) {
			if((j>0) && (selcrit[select[j-1]]<selcrit[i])){
				select[j]=select[j-1];
			}
			else {
				select[j]=i;
				j=-1;
			}
		}
	}
	if(n>0) {
		for(i=n;i<range;i++) {  
			if(selcrit[i]>selcrit[select[n-1]]) {
				for(j=n-1;j>=0;j--) {
					if((j>0) && (selcrit[select[j-1]]<selcrit[i])) {
						select[j]=select[j-1];
					}
					else {
						select[j]=i;
						j=-1;
					}
				}
			}
		}
	}
}      
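
/* Illustrative example (hypothetical, not original code): select_top_n
   fills select[0..n-1] with the indices of the n largest selcrit entries,
   in descending order, using a partial insertion sort (O(range*n)). */
static void example_select_top_n()
{
	double selcrit[6]={0.1,2.5,1.0,3.0,0.2,2.0};
	long select[3];
	select_top_n(selcrit,6,select,3);
	/* select now holds {3,1,5}, i.e. the values 3.0, 2.5, 2.0 */
}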


/******************************** Shrinking  *********************************/

void init_shrink_state(SHRINK_STATE *shrink_state, long int totdoc, 
					   long int maxhistory)
{
	long i;

	shrink_state->deactnum=0;
	shrink_state->active = (long *)my_malloc(sizeof(long)*totdoc);
	shrink_state->inactive_since = (long *)my_malloc(sizeof(long)*totdoc);
	shrink_state->a_history = (double **)my_malloc(sizeof(double *)*maxhistory);
	shrink_state->maxhistory=maxhistory;
	shrink_state->last_lin = (double *)my_malloc(sizeof(double)*totdoc);
	shrink_state->last_a = (double *)my_malloc(sizeof(double)*totdoc);

	for(i=0;i<totdoc;i++) { 
		shrink_state->active[i]=1;
		shrink_state->inactive_since[i]=0;
		shrink_state->last_a[i]=0;
		shrink_state->last_lin[i]=0;
	}
}

void shrink_state_cleanup(SHRINK_STATE *shrink_state)
{
	free(shrink_state->active);
	free(shrink_state->inactive_since);
	if(shrink_state->deactnum > 0) 
		free(shrink_state->a_history[shrink_state->deactnum-1]);
	free(shrink_state->a_history);
	free(shrink_state->last_a);
	free(shrink_state->last_lin);
}

long shrink_problem(DOC **docs,
					LEARN_PARM *learn_parm, 
					SHRINK_STATE *shrink_state, 
					KERNEL_PARM *kernel_parm,
					long int *active2dnum, 
					long int *last_suboptimal_at, 
					long int iteration, 
					long int totdoc, 
					long int minshrink, 
					double *a, 
					long int *inconsistent)
					/* Shrink some variables away.  Do the shrinking only if at least
					minshrink variables can be removed. */
{
	long i,ii,change,activenum,lastiter;
	double *a_old;

	activenum=0;
	change=0;
	for(ii=0;active2dnum[ii]>=0;ii++) {
		i=active2dnum[ii];
		activenum++;
		if(0 && learn_parm->sharedslack)
			lastiter=last_suboptimal_at[docs[i]->slackid];
		else
			lastiter=last_suboptimal_at[i];
		if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink) 
			|| (inconsistent[i])) {
				change++;
		}
	}
	if((change>=minshrink) /* shrink only if sufficiently many candidates */
		&& (shrink_state->deactnum<shrink_state->maxhistory)) { /* and enough memory */
			/* Shrink problem by removing those variables which are */
			/* optimal at a bound for a minimum number of iterations */
			if(verbosity>=2) {
				printf(" Shrinking..."); fflush(stdout);
			}
			if(kernel_parm->kernel_type != LINEAR) { /* non-linear case: save the alphas */
				a_old=(double *)my_malloc(sizeof(double)*totdoc);
				shrink_state->a_history[shrink_state->deactnum]=a_old;
				for(i=0;i<totdoc;i++) {
					a_old[i]=a[i];
				}
			}
			for(ii=0;active2dnum[ii]>=0;ii++) {
				i=active2dnum[ii];
				if(0 && learn_parm->sharedslack)
					lastiter=last_suboptimal_at[docs[i]->slackid];
				else
					lastiter=last_suboptimal_at[i];
				if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink) 
					|| (inconsistent[i])) {
						shrink_state->active[i]=0;
						shrink_state->inactive_since[i]=shrink_state->deactnum;
				}
			}
			activenum=compute_index(shrink_state->active,totdoc,active2dnum);
			shrink_state->deactnum++;
			if(kernel_parm->kernel_type == LINEAR) { 
				shrink_state->deactnum=0;
			}
			if(verbosity>=2) {
				printf("done.\n"); fflush(stdout);
				printf(" Number of inactive variables = %ld\n",totdoc-activenum);
			}
	}
	return(activenum);
} 
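
/* For reference (added comment, not original): a variable is shrunk away
   once it has looked optimal at a bound for more than svm_iter_to_shrink
   consecutive iterations (tracked in last_suboptimal_at) or is marked
   inconsistent. For non-linear kernels the alphas at each shrink are
   snapshotted into a_history so that reactivate_inactive_examples can
   later reconstruct lin[] incrementally; the linear case keeps
   deactnum==0 because last_a/last_lin serve that purpose instead. */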


void reactivate_inactive_examples(long int *label, 
								  long int *unlabeled, 
								  double *a, 
								  SHRINK_STATE *shrink_state, 
								  double *lin, 
								  double *c, 
								  long int totdoc, 
								  long int totwords, 
								  long int iteration, 
								  LEARN_PARM *learn_parm, 
								  long int *inconsistent, 
								  DOC **docs, 
								  KERNEL_PARM *kernel_parm, 
								  KERNEL_CACHE *kernel_cache, 
								  MODEL *model, 
								  CFLOAT *aicache, 
								  double *weights, 
								  double *maxdiff)
								  /* Make all variables active again which had been removed by
								  shrinking. */
								  /* Computes lin for those variables from scratch. */
								  /* WARNING: Assumes that array of weights is initialized to all zero 
								  values for linear kernel! */
{
	register long i,j,ii,jj,t,*changed2dnum,*inactive2dnum;
	long *changed,*inactive;
	register double kernel_val,*a_old,dist;
	double ex_c,target;
	SVECTOR *f;

	if(kernel_parm->kernel_type == LINEAR) { /* special linear case */
		/* clear_vector_n(weights,totwords);  set weights to zero */
		a_old=shrink_state->last_a;    
		for(i=0;i<totdoc;i++) {
			if(a[i] != a_old[i]) {
				for(f=docs[i]->fvec;f;f=f->next)  
					add_vector_ns(weights,f,
					f->factor*((a[i]-a_old[i])*(double)label[i]));
				a_old[i]=a[i];
			}
		}
		for(i=0;i<totdoc;i++) {
			if(!shrink_state->active[i]) {
				for(f=docs[i]->fvec;f;f=f->next)  
					lin[i]=shrink_state->last_lin[i]+f->factor*sprod_ns(weights,f);
			}
			shrink_state->last_lin[i]=lin[i];
		}
		for(i=0;i<totdoc;i++) {
			for(f=docs[i]->fvec;f;f=f->next)  
				mult_vector_ns(weights,f,0.0); /* set weights back to zero */
		}
	}
	else {
		changed=(long *)my_malloc(sizeof(long)*totdoc);
		changed2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
		inactive=(long *)my_malloc(sizeof(long)*totdoc);
		inactive2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
		for(t=shrink_state->deactnum-1;(t>=0) && shrink_state->a_history[t];t--) {
			if(verbosity>=2) {
				printf("%ld..",t); fflush(stdout);
			}
			a_old=shrink_state->a_history[t];    
			for(i=0;i<totdoc;i++) {
				inactive[i]=((!shrink_state->active[i]) 
					&& (shrink_state->inactive_since[i] == t));
				changed[i]= (a[i] != a_old[i]);
			}
			compute_index(inactive,totdoc,inactive2dnum);
			compute_index(changed,totdoc,changed2dnum);

			for(ii=0;(i=changed2dnum[ii])>=0;ii++) {
				get_kernel_row(kernel_cache,docs,i,totdoc,inactive2dnum,aicache,
					kernel_parm);
				for(jj=0;(j=inactive2dnum[jj])>=0;jj++) {
					kernel_val=aicache[j];
					lin[j]+=(((a[i]*kernel_val)-(a_old[i]*kernel_val))*(double)label[i]);
				}
			}
		}
		free(changed);
		free(changed2dnum);
		free(inactive);
		free(inactive2dnum);
	}
	(*maxdiff)=0;
	for(i=0;i<totdoc;i++) {
		shrink_state->inactive_since[i]=shrink_state->deactnum-1;
		if(!inconsistent[i]) {
			dist=(lin[i]-model->b)*(double)label[i];
			target=-(learn_parm->eps-(double)label[i]*c[i]);
			ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
			if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
				if((dist-target)>(*maxdiff))  /* largest violation */
					(*maxdiff)=dist-target;
			}
			else if((a[i]<ex_c) && (dist < target)) {
				if((target-dist)>(*maxdiff))  /* largest violation */
					(*maxdiff)=target-dist;
			}
			if((a[i]>(0+learn_parm->epsilon_a)) 
				&& (a[i]<ex_c)) { 
					shrink_state->active[i]=1;                         /* not at bound */
			}
			else if((a[i]<=(0+learn_parm->epsilon_a)) && (dist < (target+learn_parm->epsilon_shrink))) {
				shrink_state->active[i]=1;
			}
			else if((a[i]>=ex_c)
				&& (dist > (target-learn_parm->epsilon_shrink))) {
					shrink_state->active[i]=1;
			}
			else if(learn_parm->sharedslack) { /* make all active when sharedslack */
				shrink_state->active[i]=1;
			}
		}
	}
	if(kernel_parm->kernel_type != LINEAR) { /* update history for non-linear */
		for(i=0;i<totdoc;i++) {
			(shrink_state->a_history[shrink_state->deactnum-1])[i]=a[i];
		}
		for(t=shrink_state->deactnum-2;(t>=0) && shrink_state->a_history[t];t--) {
			free(shrink_state->a_history[t]);
			shrink_state->a_history[t]=0;
		}
	}
}
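
/* The nonlinear branch above exploits the incremental identity
   lin[j] += (a[i]-a_old[i])*y[i]*K(x_i,x_j): only rows whose alpha
   changed since the last shrink contribute. A minimal stand-alone
   sketch of that update rule follows (O(totdoc^2) kernel evaluations,
   so illustrative only; the guard macro SVM_LEARN_EXAMPLES is
   hypothetical and the arrays are assumed laid out as in this file). */
#ifdef SVM_LEARN_EXAMPLES
static void example_update_lin(DOC **docs, long *label, double *a, 
							   double *a_old, double *lin, long totdoc, 
							   KERNEL_PARM *kernel_parm)
{
	long i,j;
	double delta;

	for(i=0;i<totdoc;i++) {
		delta=(a[i]-a_old[i])*(double)label[i];
		if(delta != 0.0) {                  /* skip unchanged alphas */
			for(j=0;j<totdoc;j++) 
				lin[j]+=delta*kernel(kernel_parm,docs[i],docs[j]);
		}
	}
}
#endif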

/****************************** Cache handling *******************************/

void get_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs, 
					long int docnum, long int totdoc, 
					long int *active2dnum, CFLOAT *buffer, 
					KERNEL_PARM *kernel_parm)
					/* Gets a row of the matrix of kernel values. This matrix has
					the same form as the Hessian, except that the elements are not
					multiplied by y_i * y_j * a_i * a_j. */
					/* Takes the values from the cache if available. */
{
	register long i,j,start;
	DOC *ex;

	ex=docs[docnum];

	if(kernel_cache && (kernel_cache->index[docnum] != -1)) {/* row is cached? */
		kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time;/* lru */
		start=kernel_cache->activenum*kernel_cache->index[docnum];
		for(i=0;(j=active2dnum[i])>=0;i++) {
			if(kernel_cache->totdoc2active[j] >= 0) { /* column is cached? */
				buffer[j]=kernel_cache->buffer[start+kernel_cache->totdoc2active[j]];
			}
			else {
				buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
			}
		}
	}
	else {
		for(i=0;(j=active2dnum[i])>=0;i++) {
			buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
		}
	}
}
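
/* Usage sketch for get_kernel_row(), illustrative only: the guard macro
   SVM_LEARN_EXAMPLES is hypothetical, totdoc>=3 is assumed, and the
   active2dnum list must be terminated by -1, which is the format
   produced by compute_index() elsewhere in this file. */
#ifdef SVM_LEARN_EXAMPLES
static void example_get_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs, 
								   long totdoc, KERNEL_PARM *kernel_parm)
{
	CFLOAT *buffer=(CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
	long active2dnum[4]={0,1,2,-1};    /* first three examples active */
	long i,j;

	/* row 0 of the kernel matrix: K(x_0,x_j) for every active j */
	get_kernel_row(kernel_cache,docs,0,totdoc,active2dnum,buffer,
				   kernel_parm);
	for(i=0;(j=active2dnum[i])>=0;i++) 
		printf("K(0,%ld)=%f\n",j,(double)buffer[j]);
	free(buffer);
}
#endif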


void cache_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs, 
					  long int m, KERNEL_PARM *kernel_parm)
					  /* Fills cache for the row m */
{
	register DOC *ex;
	register long j,k,l;
	register CFLOAT *cache;

	if(!kernel_cache_check(kernel_cache,m)) {  /* not cached yet*/
		cache = kernel_cache_clean_and_malloc(kernel_cache,m);
		if(cache) {
			l=kernel_cache->totdoc2active[m];
			ex=docs[m];
			for(j=0;j<kernel_cache->activenum;j++) {  /* fill cache */
				k=kernel_cache->active2totdoc[j];
				if((kernel_cache->index[k] != -1) && (l != -1) && (k != m)) {
					cache[j]=kernel_cache->buffer[kernel_cache->activenum
						*kernel_cache->index[k]+l];
				}
				else {
					cache[j]=kernel(kernel_parm,ex,docs[k]);
				} 
			}
		}
		else {
			perror("Error: Kernel cache full! => increase cache size");
		}
	}
}


void cache_multiple_kernel_rows(KERNEL_CACHE *kernel_cache, DOC **docs, 
								long int *key, long int varnum, 
								KERNEL_PARM *kernel_parm)
								/* Fills cache for the rows in key */
{
	register long i;

	for(i=0;i<varnum;i++) {  /* fill up kernel cache */
		cache_kernel_row(kernel_cache,docs,key[i],kernel_parm);
	}
}


void kernel_cache_shrink(KERNEL_CACHE *kernel_cache, long int totdoc, 
						 long int numshrink, long int *after)
						 /* Remove numshrink columns from the cache that correspond to
						 examples marked 0 in after. */
{
	register long i,j,jj,from=0,to=0,scount;  
	long *keep;

	if(verbosity>=2) {
		printf(" Reorganizing cache..."); fflush(stdout);
	}

	keep=(long *)my_malloc(sizeof(long)*totdoc);
	for(j=0;j<totdoc;j++) {
		keep[j]=1;
	}
	scount=0;
	for(jj=0;(jj<kernel_cache->activenum) && (scount<numshrink);jj++) {
		j=kernel_cache->active2totdoc[jj];
		if(!after[j]) {
			scount++;
			keep[j]=0;
		}
	}

	for(i=0;i<kernel_cache->max_elems;i++) {
		for(jj=0;jj<kernel_cache->activenum;jj++) {
			j=kernel_cache->active2totdoc[jj];
			if(!keep[j]) {
				from++;
			}
			else {
				kernel_cache->buffer[to]=kernel_cache->buffer[from];
				to++;
				from++;
			}
		}
	}

	kernel_cache->activenum=0;
	for(j=0;j<totdoc;j++) {
		if((keep[j]) && (kernel_cache->totdoc2active[j] != -1)) {
			kernel_cache->active2totdoc[kernel_cache->activenum]=j;
			kernel_cache->totdoc2active[j]=kernel_cache->activenum;
			kernel_cache->activenum++;
		}
		else {
			kernel_cache->totdoc2active[j]=-1;
		}
	}

	/* recompute how many rows fit; assumes activenum stays positive here */
	kernel_cache->max_elems=(long)(kernel_cache->buffsize/kernel_cache->activenum);
	if(kernel_cache->max_elems>totdoc) {
		kernel_cache->max_elems=totdoc;
	}

	free(keep);

	if(verbosity>=2) {
		printf("done.\n"); fflush(stdout);
		printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
	}
}
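
/* Usage sketch for kernel_cache_shrink(), illustrative only (the guard
   macro SVM_LEARN_EXAMPLES is hypothetical): entries marked 0 in the
   after array are dropped from the cached columns, here example 0. */
#ifdef SVM_LEARN_EXAMPLES
static void example_cache_shrink(KERNEL_CACHE *kernel_cache, long totdoc)
{
	long *after=(long *)my_malloc(sizeof(long)*totdoc);
	long i;

	for(i=0;i<totdoc;i++) 
		after[i]=1;
	after[0]=0;                            /* shrink example 0 away */
	kernel_cache_shrink(kernel_cache,totdoc,1,after);
	free(after);
}
#endif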

KERNEL_CACHE *kernel_cache_init(long int totdoc, long int buffsize)
{
	long i;
	KERNEL_CACHE *kernel_cache;

	kernel_cache=(KERNEL_CACHE *)my_malloc(sizeof(KERNEL_CACHE));
	kernel_cache->index = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->occu = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->lru = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->invindex = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->active2totdoc = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->totdoc2active = (long *)my_malloc(sizeof(long)*totdoc);
	kernel_cache->buffer = (CFLOAT *)my_malloc((size_t)(buffsize)*1024*1024);

	kernel_cache->buffsize=(long)(((size_t)buffsize*1024*1024)/sizeof(CFLOAT)); /* capacity in CFLOAT elements */

	kernel_cache->max_elems=(long)(kernel_cache->buffsize/totdoc);
	if(kernel_cache->max_elems>totdoc) {
		kernel_cache->max_elems=totdoc;
	}

	if(verbosity>=2) {
		printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
		printf(" Kernel evals so far: %ld\n",kernel_cache_statistic);    
	}

	kernel_cache->elems=0;   /* initialize cache */
	for(i=0;i<totdoc;i++) {
		kernel_cache->index[i]=-1;
		kernel_cache->lru[i]=0;
	}
	for(i=0;i<totdoc;i++) {
		kernel_cache->occu[i]=0;
		kernel_cache->invindex[i]=-1;
	}

	kernel_cache->activenum=totdoc;
	for(i=0;i<totdoc;i++) {
		kernel_cache->active2totdoc[i]=i;
		kernel_cache->totdoc2active[i]=i;
	}

	kernel_cache->time=0;  

	return(kernel_cache);
} 
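
/* Lifecycle sketch, illustrative only (the guard macro
   SVM_LEARN_EXAMPLES is hypothetical): a cache is created once per
   training run with the number of examples and its size in megabytes,
   filled row by row during optimization, and released with
   kernel_cache_cleanup(). The 40 MB figure is an arbitrary example. */
#ifdef SVM_LEARN_EXAMPLES
static void example_cache_lifecycle(DOC **docs, long totdoc, 
									KERNEL_PARM *kernel_parm)
{
	KERNEL_CACHE *kernel_cache=kernel_cache_init(totdoc,40); /* 40 MB */

	cache_kernel_row(kernel_cache,docs,0,kernel_parm);  /* fill row 0 */
	if(kernel_cache_check(kernel_cache,0)) 
		printf("row 0 is cached\n");
	kernel_cache_cleanup(kernel_cache);
}
#endif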

void kernel_cache_reset_lru(KERNEL_CACHE *kernel_cache)
{
	long maxlru=0,k;

	for(k=0;k<kernel_cache->max_elems;k++) {
		if(maxlru < kernel_cache->lru[k]) 
			maxlru=kernel_cache->lru[k];
	}
	for(k=0;k<kernel_cache->max_elems;k++) {
		kernel_cache->lru[k]-=maxlru;
	}
}

void kernel_cache_cleanup(KERNEL_CACHE *kernel_cache)
{
	free(kernel_cache->index);
	free(kernel_cache->occu);
	free(kernel_cache->lru);
	free(kernel_cache->invindex);
	free(kernel_cache->active2totdoc);
	free(kernel_cache->totdoc2active);
	free(kernel_cache->buffer);
	free(kernel_cache);
}

long kernel_cache_malloc(KERNEL_CACHE *kernel_cache)
{
	long i;

	if(kernel_cache_space_available(kernel_cache)) {
		for(i=0;i<kernel_cache->max_elems;i++) {
			if(!kernel_cache->occu[i]) {
				kernel_cache->occu[i]=1;
				kernel_cache->elems++;
				return(i);
			}
		}
	}
	return(-1);
}

void kernel_cache_free(KERNEL_CACHE *kernel_cache, long int i)
{
	kernel_cache->occu[i]=0;
	kernel_cache->elems--;
}

long kernel_cache_free_lru(KERNEL_CACHE *kernel_cache) 
/* remove least recently used cache element */
{                                     
	register long k,least_elem=-1,least_time;

	least_time=kernel_cache->time+1;
	for(k=0;k<kernel_cache->max_elems;k++) {
		if(kernel_cache->invindex[k] != -1) {
			if(kernel_cache->lru[k]<least_time) {
				least_time=kernel_cache->lru[k];
				least_elem=k;
			}
		}
	}
	if(least_elem != -1) {
		kernel_cache_free(kernel_cache,least_elem);
		kernel_cache->index[kernel_cache->invindex[least_elem]]=-1;
		kernel_cache->invindex[least_elem]=-1;
		return(1);
	}
	return(0);
}
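
/* Eviction sketch, illustrative only (guard macro SVM_LEARN_EXAMPLES
   is hypothetical; assumes the cache holds at least two rows):
   touching a row protects it, free_lru removes the stalest one. */
#ifdef SVM_LEARN_EXAMPLES
static void example_lru(KERNEL_CACHE *kernel_cache, DOC **docs, 
						KERNEL_PARM *kernel_parm)
{
	cache_kernel_row(kernel_cache,docs,0,kernel_parm);
	cache_kernel_row(kernel_cache,docs,1,kernel_parm);
	kernel_cache->time++;                   /* advance the cache clock */
	kernel_cache_touch(kernel_cache,1);     /* row 1 becomes most recent */
	if(kernel_cache_free_lru(kernel_cache)) /* evicts row 0, the stalest */
		printf("row 0 still cached: %ld\n",kernel_cache_check(kernel_cache,0));
}
#endif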


CFLOAT *kernel_cache_clean_and_malloc(KERNEL_CACHE *kernel_cache, 
									  long int docnum)
									  /* Get a free cache entry. In case cache is full, the lru element
									  is removed. */
{
	long result;
	if((result = kernel_cache_malloc(kernel_cache)) == -1) {
		if(kernel_cache_free_lru(kernel_cache)) {
			result = kernel_cache_malloc(kernel_cache);
		}
	}
	kernel_cache->index[docnum]=result;
	if(result == -1) {
		return(0);
	}
	kernel_cache->invindex[result]=docnum;
	kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
	/* plain pointer arithmetic; casting through long is unsafe on LLP64 */
	return(kernel_cache->buffer
		+(kernel_cache->activenum*kernel_cache->index[docnum]));
}

long kernel_cache_touch(KERNEL_CACHE *kernel_cache, long int docnum)
/* Update lru time to avoid removal from cache. */
{
	if(kernel_cache && kernel_cache->index[docnum] != -1) {
		kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
		return(1);
	}
	return(0);
}

long kernel_cache_check(KERNEL_CACHE *kernel_cache, long int docnum)
/* Is that row cached? */
{
	return(kernel_cache->index[docnum] != -1);
}

long kernel_cache_space_available(KERNEL_CACHE *kernel_cache)
/* Is there room for one more row? */
{
	return(kernel_cache->elems < kernel_cache->max_elems);
}

/************************** Compute estimates ******************************/

void compute_xa_estimates(MODEL *model, long int *label, 
						  long int *unlabeled, long int totdoc, 
						  DOC **docs, double *lin, double *a, 
						  KERNEL_PARM *kernel_parm, 
						  LEARN_PARM *learn_parm, double *error, 
						  double *recall, double *precision) 
						  /* Computes the xi-alpha estimates of error rate, recall, and
						  precision. See T. Joachims, Estimating the Generalization
						  Performance of an SVM Efficiently, ICML, 2000. */
{
	long i,looerror,looposerror,loonegerror;
	long totex,totposex;
	double xi,r_delta,r_delta_sq,sim=0;
	long *sv2dnum=NULL,*sv=NULL,svnum;

	r_delta=estimate_r_delta(docs,totdoc,kernel_parm); 
	r_delta_sq=r_delta*r_delta;

	looerror=0;
	looposerror=0;
	loonegerror=0;
	totex=0;
	totposex=0;
	svnum=0;

	if(learn_parm->xa_depth > 0) {
		sv = (long *)my_malloc(sizeof(long)*(totdoc+11));
		for(i=0;i<totdoc;i++) 
			sv[i]=0;
		for(i=1;i<model->sv_num;i++) 
			if(a[model->supvec[i]->docnum] 
				< (learn_parm->svm_cost[model->supvec[i]->docnum]
				-learn_parm->epsilon_a)) {
				sv[model->supvec[i]->docnum]=1;
				svnum++;
			}
		sv2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
		clear_index(sv2dnum);
		compute_index(sv,totdoc,sv2dnum);
	}

	for(i=0;i<totdoc;i++) {
		if(unlabeled[i]) {
			/* ignore it */
		}
		else {
			xi=1.0-((lin[i]-model->b)*(double)label[i]);
			if(xi<0) xi=0;
			if(label[i]>0) {
				totposex++;
			}
			if((learn_parm->rho*a[i]*r_delta_sq+xi) >= 1.0) {
				if(learn_parm->xa_depth > 0) {  /* makes assumptions */
					sim=distribute_alpha_t_greedily(sv2dnum,svnum,docs,a,i,label,
						kernel_parm,learn_parm,
						(double)((1.0-xi-a[i]*r_delta_sq)/(2.0*a[i])));
				}
				if((learn_parm->xa_depth == 0) || 
					((a[i]*kernel(kernel_parm,docs[i],docs[i])+a[i]*2.0*sim+xi) >= 1.0)) { 
						looerror++;
						if(label[i]>0) {
							looposerror++;
						}
						else {
							loonegerror++;
						}
				}
			}
			totex++;
		}
	}

	(*error)=((double)looerror/(double)totex)*100.0;
	(*recall)=(1.0-(double)looposerror/(double)totposex)*100.0;
	(*precision)=(((double)totposex-(double)looposerror)
		/((double)totposex-(double)looposerror+(double)loonegerror))*100.0;

	free(sv);
	free(sv2dnum);
}
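
/* The counting criterion above: example i is a potential leave-one-out
   error iff rho*a_i*R_delta^2 + xi_i >= 1 with slack
   xi_i = max(0, 1 - y_i*(lin_i - b)). A minimal stand-alone sketch of
   just this count follows (guard macro SVM_LEARN_EXAMPLES is
   hypothetical; rho and r_delta_sq are passed in by the caller). */
#ifdef SVM_LEARN_EXAMPLES
static long example_xialpha_count(double *a, double *lin, long *label, 
								  long totdoc, double b, double rho, 
								  double r_delta_sq)
{
	long i,errors=0;
	double xi;

	for(i=0;i<totdoc;i++) {
		xi=1.0-((lin[i]-b)*(double)label[i]);
		if(xi<0) xi=0;
		if((rho*a[i]*r_delta_sq+xi) >= 1.0)   /* xi-alpha criterion */
			errors++;
	}
	return(errors);
}
#endif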


double distribute_alpha_t_greedily(long int *sv2dnum, long int svnum, 
								   DOC **docs, double *a, 
								   long int docnum, 
								   long int *label, 
								   KERNEL_PARM *kernel_parm, 
								   LEARN_PARM *learn_parm, double thresh)
								   /* Experimental code improving the plain xi-alpha estimate by
								   computing a tighter bound with a greedy optimization strategy. */
{
	long best_depth=0;
	long i,j,k,d,skip,allskip;
	double best,best_val[101],val,init_val_sq,init_val_lin;
	long best_ex[101];
	CFLOAT *cache,*trow;

	cache=(CFLOAT *)my_malloc(sizeof(CFLOAT)*learn_parm->xa_depth*svnum);
	trow = (CFLOAT *)my_malloc(sizeof(CFLOAT)*svnum);

	for(k=0;k<svnum;k++) {
		trow[k]=kernel(kernel_parm,docs[docnum],docs[sv2dnum[k]]);
	}

	init_val_sq=0;
	init_val_lin=0;
	best=0;

	for(d=0;d<learn_parm->xa_depth;d++) {
		allskip=1;
		if(d>=1) {
			init_val_sq+=cache[best_ex[d-1]+svnum*(d-1)]; 
			for(k=0;k<d-1;k++) {
				init_val_sq+=2.0*cache[best_ex[k]+svnum*(d-1)]; 
			}
			init_val_lin+=trow[best_ex[d-1]]; 
		}
		for(i=0;i<svnum;i++) {
			skip=0;
			if(sv2dnum[i] == docnum) skip=1;
			for(j=0;j<d;j++) {
				if(i == best_ex[j]) skip=1;
			}

			if(!skip) {
				val=init_val_sq;
				val+=kernel(kernel_parm,docs[sv2dnum[i]],docs[sv2dnum[i]]);
				for(j=0;j<d;j++) {
					val+=2.0*cache[i+j*svnum];
				}
				val*=(1.0/(2.0*(d+1.0)*(d+1.0)));
				val-=((init_val_lin+trow[i])/(d+1.0));

				if(allskip || (val < best_val[d])) {
					best_val[d]=val;
					best_ex[d]=i;
				}
				allskip=0;
				if(val < thresh) {
					i=svnum;
					/*	  printf("EARLY"); */
				}
			}
		}
		if(!allskip) {
			for(k=0;k<svnum;k++) {
				cache[d*svnum+k]=kernel(kernel_parm,
					docs[sv2dnum[best_ex[d]]],
					docs[sv2dnum[k]]);
			}
		}
		if((!allskip) && ((best_val[d] < best) || (d == 0))) {
			best=best_val[d];
			best_depth=d;
		}
		if(allskip || (best < thresh)) {
			d=learn_parm->xa_depth;
		}
	}    

	free(cache);
	free(trow);

	/*  printf("Distribute[%ld](%ld)=%f, ",docnum,best_depth,best); */
	return(best);
}


void estimate_transduction_quality(MODEL *model, long int *label, 
								   long int *unlabeled, 
								   long int totdoc, DOC **docs, double *lin) 
								   /* Loo-bound based on the observation that loo-errors must be
								   equally distributed over training and test examples, given
								   that the test examples are classified correctly. Compare the
								   chapter "Constraints on the Transductive Hyperplane" in my
								   dissertation. */
{
	long i,j,l=0,ulab=0,lab=0,labpos=0,labneg=0,ulabpos=0,ulabneg=0,totulab=0;
	double totlab=0,totlabpos=0,totlabneg=0,labsum=0,ulabsum=0;
	double r_delta,r_delta_sq,xi,xisum=0,asum=0;

	r_delta=estimate_r_delta(docs,totdoc,&(model->kernel_parm)); 
	r_delta_sq=r_delta*r_delta;

	for(j=0;j<totdoc;j++) {
		if(unlabeled[j]) {
			totulab++;
		}
		else {
			totlab++;
			if(label[j] > 0) 
				totlabpos++;
			else 
				totlabneg++;
		}
	}
	for(j=1;j<model->sv_num;j++) {
		i=model->supvec[j]->docnum;
		xi=1.0-((lin[i]-model->b)*(double)label[i]);
		if(xi<0) xi=0;

		xisum+=xi;
		asum+=fabs(model->alpha[j]);
		if(unlabeled[i]) {
			ulabsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
		}
		else {
			labsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
		}
		if((fabs(model->alpha[j])*r_delta_sq+xi) >= 1) { 
			l++;
			if(unlabeled[model->supvec[j]->docnum]) {
				ulab++;
				if(model->alpha[j] > 0) 
					ulabpos++;
				else 
					ulabneg++;
			}
			else {
				lab++;
				if(model->alpha[j] > 0) 
					labpos++;
				else 
					labneg++;
			}
		}
	}
	printf("xacrit>=1: labeledpos=%.5f labeledneg=%.5f default=%.5f\n",(double)labpos/(double)totlab*100.0,(double)labneg/(double)totlab*100.0,(double)totlabpos/(double)(totlab)*100.0);
	printf("xacrit>=1: unlabelpos=%.5f unlabelneg=%.5f\n",(double)ulabpos/(double)totulab*100.0,(double)ulabneg/(double)totulab*100.0);
	printf("xacrit>=1: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)lab/(double)totlab*100.0,(double)ulab/(double)totulab*100.0,(double)l/(double)(totdoc)*100.0);
	printf("xacritsum: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)labsum/(double)totlab*100.0,(double)ulabsum/(double)totulab*100.0,(double)(labsum+ulabsum)/(double)(totdoc)*100.0);
	printf("r_delta_sq=%.5f xisum=%.5f asum=%.5f\n",r_delta_sq,xisum,asum);
}

double estimate_margin_vcdim(MODEL *model, double w, double R) 
/* w: length of the model vector in feature space (pass w<0 to compute it) */
/* R: radius of a ball containing the data (pass R<0 to estimate it) */
{
	double h;

	/* follows chapter 5.6.4 in [Vapnik/95] */

	if(w<0) {
		w=model_length_s(model);
	}
	if(R<0) {
		R=estimate_sphere(model); 
	}
	h = w*w * R*R +1; 
	return(h);
}
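
/* Usage sketch, illustrative only (guard macro SVM_LEARN_EXAMPLES is
   hypothetical): passing negative values for w and R makes the routine
   derive both from the model; e.g. ||w||=10 and R=1 would give
   h = 10*10*1*1 + 1 = 101. */
#ifdef SVM_LEARN_EXAMPLES
static void example_vcdim(MODEL *model)
{
	double h=estimate_margin_vcdim(model,-1.0,-1.0);
	printf("margin-based VC-dimension estimate: %f\n",h);
}
#endif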

double estimate_sphere(MODEL *model) 
/* Approximates the radius of the ball containing the support vectors
   by bounding it with the length of the longest support vector. This
   is pretty good for text categorization, since all documents have
   feature vectors of length 1. It assumes that the center of the
   ball is at the origin of the space. */
{
	register long j;
	double xlen,maxxlen=0;
	DOC *nulldoc;
	TOKEN nullword;
	KERNEL_PARM *kernel_parm=&(model->kernel_parm);

	nullword.wnum=0;
	nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); 

	for(j=1;j<model->sv_num;j++) {
		xlen=sqrt(kernel(kernel_parm,model->supvec[j],model->supvec[j])
			-2*kernel(kernel_parm,model->supvec[j],nulldoc)
			+kernel(kernel_parm,nulldoc,nulldoc));
		if(xlen>maxxlen) {
			maxxlen=xlen;
		}
	}

	free_example(nulldoc,1);
	return(maxxlen);
}

double estimate_r_delta(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm)
{
	long i;
	double maxxlen,xlen;
	DOC *nulldoc;           /* assumes that the center of the ball is at the */
	TOKEN nullword;          /* origin of the space. */

	nullword.wnum=0;
	nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); 

	maxxlen=0;
	for(i=0;i<totdoc;i++) {
		xlen=sqrt(kernel(kernel_parm,docs[i],docs[i])
			-2*kernel(kernel_parm,docs[i],nulldoc)
			+kernel(kernel_parm,nulldoc,nulldoc));
		if(xlen>maxxlen) {
			maxxlen=xlen;
		}
	}

	free_example(nulldoc,1);
	return(maxxlen);
}

double estimate_r_delta_average(DOC **docs, long int totdoc, 
								KERNEL_PARM *kernel_parm)
{
	long i;
	double avgxlen;
	DOC *nulldoc;           /* assumes that the center of the ball is at the */
	TOKEN nullword;          /* origin of the space. */

	nullword.wnum=0;
	nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); 

	avgxlen=0;
	for(i=0;i<totdoc;i++) {
		avgxlen+=sqrt(kernel(kernel_parm,docs[i],docs[i])
			-2*kernel(kernel_parm,docs[i],nulldoc)
			+kernel(kernel_parm,nulldoc,nulldoc));
	}

	free_example(nulldoc,1);
	return(avgxlen/totdoc);
}

double length_of_longest_document_vector(DOC **docs, long int totdoc, 
										 KERNEL_PARM *kernel_parm)
{
	long i;
	double maxxlen,xlen;

	maxxlen=0;
	for(i=0;i<totdoc;i++) {
		xlen=sqrt(kernel(kernel_parm,docs[i],docs[i]));
		if(xlen>maxxlen) {
			maxxlen=xlen;
		}
	}

	return(maxxlen);
}
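
/* Comparison sketch, illustrative only (guard macro SVM_LEARN_EXAMPLES
   is hypothetical): the three radius estimates above differ in how
   conservative they are; this prints them side by side. */
#ifdef SVM_LEARN_EXAMPLES
static void example_radii(DOC **docs, long totdoc, KERNEL_PARM *kernel_parm)
{
	printf("r_delta (max)     = %f\n",
		   estimate_r_delta(docs,totdoc,kernel_parm));
	printf("r_delta (average) = %f\n",
		   estimate_r_delta_average(docs,totdoc,kernel_parm));
	printf("longest |x|       = %f\n",
		   length_of_longest_document_vector(docs,totdoc,kernel_parm));
}
#endif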

/****************************** IO-handling **********************************/

void write_prediction(char *predfile, MODEL *model, double *lin, 
					  double *a, long int *unlabeled, 
					  long int *label, long int totdoc, 
					  LEARN_PARM *learn_parm)
{
	FILE *predfl;
	long i;
	double dist,a_max;

	if(verbosity>=1) {
		printf("Writing prediction file..."); fflush(stdout);
	}
	if ((predfl = fopen (predfile, "w")) == NULL)
	{ perror (predfile); exit (1); }
	a_max=learn_parm->epsilon_a;
	for(i=0;i<totdoc;i++) {
		if((unlabeled[i]) && (a[i]>a_max)) {
			a_max=a[i];
		}
	}
	for(i=0;i<totdoc;i++) {
		if(unlabeled[i]) {
			if(a[i]>learn_parm->epsilon_a) {
				dist=(double)label[i]*(1.0-learn_parm->epsilon_crit-a[i]/(a_max*2.0));
			}
			else {
				dist=(lin[i]-model->b);
			}
			if(dist>0) {
				fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist);
			}
			else {
				fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist);
			}
		}
	}
	fclose(predfl);
	if(verbosity>=1) {
		printf("done\n");
	}
}

void write_alphas(char *alphafile, double *a, 
				  long int *label, long int totdoc)
{
	FILE *alphafl;
	long i;

	if(verbosity>=1) {
		printf("Writing alpha file..."); fflush(stdout);
	}
	if ((alphafl = fopen (alphafile, "w")) == NULL)
	{ perror (alphafile); exit (1); }
	for(i=0;i<totdoc;i++) {
		fprintf(alphafl,"%.18g\n",a[i]*(double)label[i]);
	}
	fclose(alphafl);
	if(verbosity>=1) {
		printf("done\n");
	}
}
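
/* write_alphas() stores a[i]*label[i], one value per line, which matches
   the warm-start format (cf. the alpha argument of
   svm_learn_classification). A minimal reader sketch follows (guard
   macro SVM_LEARN_EXAMPLES is hypothetical); note that a value of
   exactly zero loses the sign, so the label defaults to +1 there. */
#ifdef SVM_LEARN_EXAMPLES
static long example_read_alphas(char *alphafile, double *a, long *label, 
								long totdoc)
{
	FILE *alphafl;
	long i=0;
	double v;

	if((alphafl=fopen(alphafile,"r")) == NULL) {
		perror(alphafile);
		return(0);
	}
	while((i<totdoc) && (fscanf(alphafl,"%lf",&v) == 1)) {
		a[i]=fabs(v);                 /* magnitude is the alpha */
		label[i]=(v >= 0) ? 1 : -1;   /* sign encodes the label */
		i++;
	}
	fclose(alphafl);
	return(i);                        /* number of alphas read */
}
#endif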

